From f04f81080727febc88304b8edc17b2810806b321 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 11 Mar 2025 16:28:29 +0800 Subject: [PATCH 001/224] Normalize the sampling-based ltr with num pairs instead of grad. (#11322) * Normalize the sampling-based ltr with num pairs instead of grad. * lint. * Cleanup. * Define a new method. * rt param. * Revert "rt param." This reverts commit d6192083f904f6327a2baeafc561d5862cf9348d. --- doc/parameter.rst | 4 ++ doc/tutorials/learning_to_rank.rst | 2 - include/xgboost/base.h | 8 ++- python-package/xgboost/testing/ranking.py | 48 +++++++++++++++++ src/common/ranking_utils.cuh | 2 + src/common/ranking_utils.h | 13 +++-- src/objective/lambdarank_obj.cc | 21 ++++++-- src/objective/lambdarank_obj.cu | 61 ++++++++++++++-------- src/objective/lambdarank_obj.cuh | 6 ++- src/objective/lambdarank_obj.h | 8 +-- tests/cpp/objective/test_lambdarank_obj.cc | 39 +++++++------- tests/cpp/objective/test_lambdarank_obj.cu | 1 + tests/cpp/objective/test_lambdarank_obj.h | 5 +- 13 files changed, 154 insertions(+), 64 deletions(-) diff --git a/doc/parameter.rst b/doc/parameter.rst index e9a309c24766..2eedf39fe9de 100644 --- a/doc/parameter.rst +++ b/doc/parameter.rst @@ -540,6 +540,10 @@ These are parameters specific to learning to rank task. See :doc:`Learning to Ra Whether to normalize the leaf value by lambda gradient. This can sometimes stagnate the training progress. + .. versionchanged:: 3.1.0 + + When the ``mean`` method is used, it's normalized by the ``lambdarank_num_pair_per_sample`` instead of gradient. + * ``lambdarank_score_normalization`` [default = ``true``] .. versionadded:: 3.0.0 diff --git a/doc/tutorials/learning_to_rank.rst b/doc/tutorials/learning_to_rank.rst index e1c1ab85a3eb..ea5309d31ca0 100644 --- a/doc/tutorials/learning_to_rank.rst +++ b/doc/tutorials/learning_to_rank.rst @@ -198,8 +198,6 @@ The learning to rank implementation has been significantly updated in 2.0 with a # 1.7 only supports sampling, while 2.0 and later use top-k as the default. # See above sections for the trade-off. "lambdarank_pair_method": "mean", - # Normalization was added in 2.0 - "lambdarank_normalization": False, # 1.7 uses the ranknet loss while later versions use the NDCG weighted loss "objective": "rank:pairwise", # 1.7 doesn't have this normalization. diff --git a/include/xgboost/base.h b/include/xgboost/base.h index 64aab5c41b0c..4318bd808631 100644 --- a/include/xgboost/base.h +++ b/include/xgboost/base.h @@ -105,9 +105,13 @@ using bst_bin_t = std::int32_t; // NOLINT * @brief Type for data row index (sample). */ using bst_idx_t = std::uint64_t; // NOLINT -/*! \brief Type for tree node index. */ +/** + * \brief Type for tree node index. + */ using bst_node_t = std::int32_t; // NOLINT -/*! \brief Type for ranking group index. */ +/** + * @brief Type for ranking group index. + */ using bst_group_t = std::uint32_t; // NOLINT /** * @brief Type for indexing into output targets. 
diff --git a/python-package/xgboost/testing/ranking.py b/python-package/xgboost/testing/ranking.py index ebf88eceecf2..588c210750c8 100644 --- a/python-package/xgboost/testing/ranking.py +++ b/python-package/xgboost/testing/ranking.py @@ -105,6 +105,7 @@ def run_ranking_categorical(device: str) -> None: def run_normalization(device: str) -> None: """Test normalization.""" X, y, qid, _ = tm.make_ltr(2048, 4, 64, 3) + # top-k ltr = xgb.XGBRanker(objective="rank:pairwise", n_estimators=4, device=device) ltr.fit(X, y, qid=qid, eval_set=[(X, y)], eval_qid=[qid]) e0 = ltr.evals_result() @@ -119,6 +120,53 @@ def run_normalization(device: str) -> None: e1 = ltr.evals_result() assert e1["validation_0"]["ndcg@32"][-1] > e0["validation_0"]["ndcg@32"][-1] + # mean + ltr = xgb.XGBRanker( + objective="rank:pairwise", + n_estimators=4, + device=device, + lambdarank_pair_method="mean", + lambdarank_normalization=True, + ) + ltr.fit(X, y, qid=qid, eval_set=[(X, y)], eval_qid=[qid]) + e0 = ltr.evals_result() + + ltr = xgb.XGBRanker( + objective="rank:pairwise", + n_estimators=4, + device=device, + lambdarank_pair_method="mean", + lambdarank_normalization=False, + ) + ltr.fit(X, y, qid=qid, eval_set=[(X, y)], eval_qid=[qid]) + e1 = ltr.evals_result() + # no normalization since the number of pairs is 1. + assert e1["validation_0"]["ndcg"][-1] == e0["validation_0"]["ndcg"][-1] + + # mean + ltr = xgb.XGBRanker( + objective="rank:pairwise", + n_estimators=4, + device=device, + lambdarank_pair_method="mean", + lambdarank_normalization=True, + lambdarank_num_pair_per_sample=4, + ) + ltr.fit(X, y, qid=qid, eval_set=[(X, y)], eval_qid=[qid]) + e0 = ltr.evals_result() + + ltr = xgb.XGBRanker( + objective="rank:pairwise", + n_estimators=4, + device=device, + lambdarank_pair_method="mean", + lambdarank_normalization=False, + lambdarank_num_pair_per_sample=4, + ) + ltr.fit(X, y, qid=qid, eval_set=[(X, y)], eval_qid=[qid]) + e1 = ltr.evals_result() + assert e1["validation_0"]["ndcg"][-1] != e0["validation_0"]["ndcg"][-1] + def run_score_normalization(device: str, objective: str) -> None: """Test normalization by score differences.""" diff --git a/src/common/ranking_utils.cuh b/src/common/ranking_utils.cuh index 297f5157ecfb..9025dfdbc533 100644 --- a/src/common/ranking_utils.cuh +++ b/src/common/ranking_utils.cuh @@ -30,6 +30,8 @@ XGBOOST_DEVICE __forceinline__ std::size_t ThreadsForMean(std::size_t group_size std::size_t n_pairs) { return group_size * n_pairs; } +// Number of threads in a group divided by the number of samples in this group, returns +// the number of pairs for pair-wise ltr with sampling. 
XGBOOST_DEVICE __forceinline__ std::size_t PairsForGroup(std::size_t n_threads, std::size_t group_size) { return n_threads / group_size; diff --git a/src/common/ranking_utils.h b/src/common/ranking_utils.h index 8d98dfb913d7..16a264fdc967 100644 --- a/src/common/ranking_utils.h +++ b/src/common/ranking_utils.h @@ -115,6 +115,7 @@ struct LambdaRankParam : public XGBoostParameter { } [[nodiscard]] bool HasTruncation() const { return lambdarank_pair_method == PairMethod::kTopK; } + [[nodiscard]] bool IsMean() const { return lambdarank_pair_method == PairMethod::kMean; } // Used for evaluation metric and cache initialization, iterate through top-k or the whole list [[nodiscard]] auto TopK() const { @@ -180,7 +181,8 @@ class RankingCache { HostDeviceVector y_sorted_idx_cache_; // Cached labels sorted by the model HostDeviceVector y_ranked_by_model_; - // store rounding factor for objective for each group + // Rounding factor for CUDA deterministic floating point summation. One rounding factor + // for each ranking group. linalg::Vector roundings_; // rounding factor for cost HostDeviceVector cost_rounding_; @@ -215,6 +217,9 @@ class RankingCache { if (!info.weights_.Empty()) { CHECK_EQ(Groups(), info.weights_.Size()) << error::GroupWeight(); } + if (param_.HasTruncation()) { + CHECK_GE(param_.NumPair(), 1); + } } [[nodiscard]] std::size_t MaxPositionSize() const { // Use truncation level as bound. @@ -267,21 +272,21 @@ class RankingCache { } // CUDA cache getters, the cache is shared between metric and objective, some of these - // fields are lazy initialized to avoid unnecessary allocation. + // fields are initialized lazily to avoid unnecessary allocation. [[nodiscard]] common::Span CUDAThreadsGroupPtr() const { CHECK(!threads_group_ptr_.Empty()); return threads_group_ptr_.ConstDeviceSpan(); } [[nodiscard]] std::size_t CUDAThreads() const { return n_cuda_threads_; } - linalg::VectorView CUDARounding(Context const* ctx) { + [[nodiscard]] linalg::VectorView CUDARounding(Context const* ctx) { if (roundings_.Size() == 0) { roundings_.SetDevice(ctx->Device()); roundings_.Reshape(Groups()); } return roundings_.View(ctx->Device()); } - common::Span CUDACostRounding(Context const* ctx) { + [[nodiscard]] common::Span CUDACostRounding(Context const* ctx) { if (cost_rounding_.Size() == 0) { cost_rounding_.SetDevice(ctx->Device()); cost_rounding_.Resize(1); diff --git a/src/objective/lambdarank_obj.cc b/src/objective/lambdarank_obj.cc index b19f72e1d46f..45ea357425b0 100644 --- a/src/objective/lambdarank_obj.cc +++ b/src/objective/lambdarank_obj.cc @@ -225,10 +225,23 @@ class LambdaRankObj : public FitIntercept { }; MakePairs(ctx_, iter, p_cache_, g, g_label, g_rank, loop); - if (sum_lambda > 0.0 && param_.lambdarank_normalization) { - double norm = std::log2(1.0 + sum_lambda) / sum_lambda; - std::transform(g_gpair.Values().data(), g_gpair.Values().data() + g_gpair.Size(), - g_gpair.Values().data(), [norm](GradientPair const& g) { return g * norm; }); + if (param_.lambdarank_normalization) { + double norm = 1.0; + if (param_.IsMean()) { + // Normalize using the number of pairs for mean. + auto n_pairs = this->p_cache_->Param().NumPair(); + auto scale = 1.0 / static_cast(n_pairs); + norm = scale; + } else { + // Normalize using gradient for top-k. 
+ if (sum_lambda > 0.0) { + norm = std::log2(1.0 + sum_lambda) / sum_lambda; + } + } + if (norm != 1.0) { + std::transform(linalg::begin(g_gpair), linalg::end(g_gpair), linalg::begin(g_gpair), + [norm](GradientPair const& g) { return g * norm; }); + } } auto w_norm = p_cache_->WeightNorm(); diff --git a/src/objective/lambdarank_obj.cu b/src/objective/lambdarank_obj.cu index eae067a56649..8e4dc8c36252 100644 --- a/src/objective/lambdarank_obj.cu +++ b/src/objective/lambdarank_obj.cu @@ -4,19 +4,18 @@ * \brief CUDA implementation of lambdarank. */ #include // for DMLC_REGISTRY_FILE_TAG - #include // for fill_n #include // for for_each_n #include // for make_counting_iterator #include // for make_zip_iterator #include // for make_tuple, tuple, tie, get -#include // for min -#include // for assert -#include // for abs, log2, isinf -#include // for size_t -#include // for int32_t -#include // for shared_ptr +#include // for min +#include // for assert +#include // for abs, log2, isinf +#include // for size_t +#include // for int32_t +#include // for shared_ptr #include #include "../common/algorithm.cuh" // for SegmentedArgSort @@ -33,7 +32,7 @@ #include "xgboost/host_device_vector.h" // for HostDeviceVector #include "xgboost/linalg.h" // for VectorView, Range, Vector #include "xgboost/logging.h" -#include "xgboost/span.h" // for Span +#include "xgboost/span.h" // for Span namespace xgboost::obj { DMLC_REGISTRY_FILE_TAG(lambdarank_obj_cu); @@ -84,7 +83,7 @@ struct GetGradOp { MakePairsOp make_pair; Delta delta; - bool need_update; + bool const need_update; auto __device__ operator()(std::size_t idx) -> GradCostNorm { auto const& args = make_pair.args; @@ -97,6 +96,7 @@ struct GetGradOp { auto g_predt = args.predts.subspan(data_group_begin, n_data); auto g_gpair = args.gpairs.Slice(linalg::Range(data_group_begin, data_group_begin + n_data)); auto g_rank = args.d_sorted_idx.subspan(data_group_begin, n_data); + auto n_pairs = args.n_pairs; auto [i, j] = make_pair(idx, g); @@ -110,7 +110,9 @@ struct GetGradOp { double cost{0}; - auto delta_op = [&](auto const&... args) { return delta(args..., g); }; + auto delta_op = [&](auto const&... args) { + return delta(args..., g); + }; GradientPair pg = LambdaGrad(g_label, g_predt, g_rank, rank_high, rank_low, delta_op, args.ti_plus, args.tj_minus, &cost); @@ -120,7 +122,6 @@ struct GetGradOp { if (need_update) { // second run, update the gradient - auto ng = Repulse(pg); auto gr = args.d_roundings(g); @@ -155,6 +156,7 @@ struct GetGradOp { } } } + return thrust::make_tuple(GradientPair{std::abs(pg.GetGrad()), std::abs(pg.GetHess())}, std::abs(cost), -2.0 * static_cast(pg.GetGrad())); } @@ -217,12 +219,12 @@ void CalcGrad(Context const* ctx, MetaInfo const& info, std::shared_ptr(l), thrust::get<1>(r)); double sum_lambda = thrust::get<2>(l) + thrust::get<2>(r); - return thrust::make_tuple(GradientPair{std::abs(grad), std::abs(hess)}, cost, sum_lambda); + return thrust::make_tuple(GradientPair{grad, hess}, cost, sum_lambda); }; auto init = thrust::make_tuple(GradientPair{0.0f, 0.0f}, 0.0, 0.0); common::Span d_max_lambdas = p_cache->MaxLambdas(ctx, n_groups); CHECK_EQ(n_groups * sizeof(GradCostNorm), d_max_lambdas.size_bytes()); - + // Reduce by group. 
std::size_t bytes; cub::DeviceSegmentedReduce::Reduce(nullptr, bytes, val_it, d_max_lambdas.data(), n_groups, d_threads_group_ptr.data(), d_threads_group_ptr.data() + 1, @@ -269,22 +271,35 @@ void CalcGrad(Context const* ctx, MetaInfo const& info, std::shared_ptrWeightNorm(); - auto norm = p_cache->Param().lambdarank_normalization; + auto need_norm = p_cache->Param().lambdarank_normalization; + auto n_pairs = p_cache->Param().NumPair(); + bool is_mean = p_cache->Param().IsMean(); + CHECK_EQ(is_mean, !has_truncation); thrust::for_each_n(ctx->CUDACtx()->CTP(), thrust::make_counting_iterator(0ul), d_gpair.Size(), [=] XGBOOST_DEVICE(std::size_t i) mutable { auto g = dh::SegmentId(d_gptr, i); - auto sum_lambda = thrust::get<2>(d_max_lambdas[g]); - // Normalization - if (sum_lambda > 0.0 && norm) { - double norm = std::log2(1.0 + sum_lambda) / sum_lambda; + if (need_norm) { + double norm = 1.0; + if (has_truncation) { + // Normalize using gradient for top-k. + auto sum_lambda = thrust::get<2>(d_max_lambdas[g]); + if (sum_lambda > 0.0) { + norm = std::log2(1.0 + sum_lambda) / sum_lambda; + } + } else { + // Normalize using the number of pairs for mean. + double scale = 1.0 / static_cast(n_pairs); + norm = scale; + } d_gpair(i, 0) *= norm; } + d_gpair(i, 0) *= (d_weights[g] * w_norm); }); } /** - * \brief Handles boilerplate code like getting device span. + * @brief Handles boilerplate code like getting device spans. */ template void Launch(Context const* ctx, std::int32_t iter, HostDeviceVector const& preds, @@ -304,7 +319,6 @@ void Launch(Context const* ctx, std::int32_t iter, HostDeviceVector const out_gpair->Reshape(preds.Size(), 1); CHECK(p_cache); - auto d_rounding = p_cache->CUDARounding(ctx); auto d_cost_rounding = p_cache->CUDACostRounding(ctx); @@ -327,9 +341,10 @@ void Launch(Context const* ctx, std::int32_t iter, HostDeviceVector const d_y_sorted_idx = SortY(ctx, info, rank_idx, p_cache); } - KernelInputs args{ti_plus, tj_minus, li, lj, d_gptr, d_threads_group_ptr, - rank_idx, label, predts, gpairs, d_rounding, d_cost_rounding.data(), - d_y_sorted_idx, iter}; + auto n_pairs = p_cache->Param().NumPair(); + KernelInputs args{ti_plus, tj_minus, li, lj, d_gptr, d_threads_group_ptr, + rank_idx, label, predts, gpairs, d_rounding, d_cost_rounding.data(), + n_pairs, d_y_sorted_idx, iter}; // dispatch based on unbiased and truncation if (p_cache->Param().HasTruncation()) { diff --git a/src/objective/lambdarank_obj.cuh b/src/objective/lambdarank_obj.cuh index e1a78f905434..ce95304197da 100644 --- a/src/objective/lambdarank_obj.cuh +++ b/src/objective/lambdarank_obj.cuh @@ -66,6 +66,7 @@ struct KernelInputs { linalg::VectorView d_roundings; double const *d_cost_rounding; + ltr::position_t const n_pairs; common::Span d_y_sorted_idx; std::int32_t iter; @@ -136,9 +137,10 @@ struct MakePairsOp { // The index pointing to the first element of the next bucket std::size_t right_bound = n_data - n_rights; - thrust::minstd_rand rng(args.iter); + std::uint32_t seed = args.iter * (static_cast(args.d_group_ptr.size()) - 1) + g; + thrust::minstd_rand rng(seed); auto pair_idx = i; - rng.discard(sample_pair_idx * n_data + g + pair_idx); // fixme + rng.discard(idx - args.d_threads_group_ptr[g]); // idx within group thrust::uniform_int_distribution dist(0, n_lefts + n_rights - 1); auto ridx = dist(rng); SPAN_CHECK(ridx < n_lefts + n_rights); diff --git a/src/objective/lambdarank_obj.h b/src/objective/lambdarank_obj.h index 113fce832492..56e57582eece 100644 --- a/src/objective/lambdarank_obj.h +++ 
b/src/objective/lambdarank_obj.h @@ -227,15 +227,16 @@ void MakePairs(Context const* ctx, std::int32_t iter, ltr::position_t cnt = group_ptr[g + 1] - group_ptr[g]; if (cache->Param().HasTruncation()) { - for (std::size_t i = 0; i < std::min(cnt, cache->Param().NumPair()); ++i) { + for (std::size_t i = 0, n = std::min(cnt, cache->Param().NumPair()); i < n; ++i) { for (std::size_t j = i + 1; j < cnt; ++j) { op(i, j); } } } else { CHECK_EQ(g_rank.size(), g_label.Size()); - std::minstd_rand rnd(iter); - rnd.discard(g); // fixme(jiamingy): honor the global seed + + std::uint32_t seed = iter * (static_cast(group_ptr.size()) - 1) + g; + std::minstd_rand rnd(seed); // sort label according to the rank list auto it = common::MakeIndexTransformIter( [&g_rank, &g_label](std::size_t idx) { return g_label(g_rank[idx]); }); @@ -244,7 +245,6 @@ void MakePairs(Context const* ctx, std::int32_t iter, // permutation iterator to get the original label auto rev_it = common::MakeIndexTransformIter( [&](std::size_t idx) { return g_label(g_rank[y_sorted_idx[idx]]); }); - for (std::size_t i = 0; i < cnt;) { std::size_t j = i + 1; // find the bucket boundary diff --git a/tests/cpp/objective/test_lambdarank_obj.cc b/tests/cpp/objective/test_lambdarank_obj.cc index db8472a2a7dd..7d1639e4fb79 100644 --- a/tests/cpp/objective/test_lambdarank_obj.cc +++ b/tests/cpp/objective/test_lambdarank_obj.cc @@ -3,25 +3,26 @@ */ #include "test_lambdarank_obj.h" -#include // for Test, Message, TestPartResult, CmpHel... - -#include // for sort -#include // for size_t -#include // for initializer_list -#include // for unique_ptr, shared_ptr, make_shared -#include // for iota -#include // for char_traits, basic_string, string -#include // for vector - -#include "../../../src/common/ranking_utils.h" // for NDCGCache, LambdaRankParam -#include "../helpers.h" // for CheckRankingObjFunction, CheckConfigReload -#include "xgboost/base.h" // for GradientPair, bst_group_t, Args -#include "xgboost/context.h" // for Context -#include "xgboost/data.h" // for MetaInfo, DMatrix -#include "xgboost/host_device_vector.h" // for HostDeviceVector -#include "xgboost/linalg.h" // for Tensor, All, TensorView -#include "xgboost/objective.h" // for ObjFunction -#include "xgboost/span.h" // for Span +#include // for Test, Message, TestPartResult, CmpHel... 
+ +#include // for sort +#include // for size_t +#include // for initializer_list +#include // for unique_ptr, shared_ptr, make_shared +#include // for iota +#include // for char_traits, basic_string, string +#include // for vector + +#include "../../../src/common/ranking_utils.h" // for NDCGCache, LambdaRankParam +#include "../../../src/objective/lambdarank_obj.h" // for MAPStat, MakePairs +#include "../helpers.h" // for CheckRankingObjFunction, CheckConfigReload +#include "xgboost/base.h" // for GradientPair, bst_group_t, Args +#include "xgboost/context.h" // for Context +#include "xgboost/data.h" // for MetaInfo, DMatrix +#include "xgboost/host_device_vector.h" // for HostDeviceVector +#include "xgboost/linalg.h" // for Tensor, All, TensorView +#include "xgboost/objective.h" // for ObjFunction +#include "xgboost/span.h" // for Span namespace xgboost::obj { TEST(LambdaRank, NDCGJsonIO) { diff --git a/tests/cpp/objective/test_lambdarank_obj.cu b/tests/cpp/objective/test_lambdarank_obj.cu index c80ec20fc63d..d33273678662 100644 --- a/tests/cpp/objective/test_lambdarank_obj.cu +++ b/tests/cpp/objective/test_lambdarank_obj.cu @@ -55,6 +55,7 @@ void TestGPUMakePair() { linalg::MatrixView{common::Span{}, {0}, DeviceOrd::CUDA(0)}, dg, nullptr, + 1, y_sorted_idx, 0}; return args; diff --git a/tests/cpp/objective/test_lambdarank_obj.h b/tests/cpp/objective/test_lambdarank_obj.h index 9539f1a3003e..4383a44d1a75 100644 --- a/tests/cpp/objective/test_lambdarank_obj.h +++ b/tests/cpp/objective/test_lambdarank_obj.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2023, XGBoost Contributors + * Copyright 2023-2025, XGBoost Contributors */ #ifndef XGBOOST_OBJECTIVE_TEST_LAMBDARANK_OBJ_H_ #define XGBOOST_OBJECTIVE_TEST_LAMBDARANK_OBJ_H_ @@ -10,11 +10,8 @@ #include // for ObjFunction #include // for shared_ptr, make_shared -#include // for iota -#include // for vector #include "../../../src/common/ranking_utils.h" // for LambdaRankParam, MAPCache -#include "../../../src/objective/lambdarank_obj.h" // for MAPStat #include "../helpers.h" // for EmptyDMatrix namespace xgboost::obj { From 58e908c6649a166c3b47ea5e06e2a8539718a2b0 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 11 Mar 2025 19:08:26 +0800 Subject: [PATCH 002/224] [doc] Fix version change. (#11328) --- doc/parameter.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/parameter.rst b/doc/parameter.rst index 2eedf39fe9de..0125dbdae9d1 100644 --- a/doc/parameter.rst +++ b/doc/parameter.rst @@ -540,7 +540,7 @@ These are parameters specific to learning to rank task. See :doc:`Learning to Ra Whether to normalize the leaf value by lambda gradient. This can sometimes stagnate the training progress. - .. versionchanged:: 3.1.0 + .. versionchanged:: 3.0.0 When the ``mean`` method is used, it's normalized by the ``lambdarank_num_pair_per_sample`` instead of gradient. From a57657bb365e2d2e081b8ec77297d2f889e21517 Mon Sep 17 00:00:00 2001 From: Dmitry Razdoburdin Date: Wed, 12 Mar 2025 11:08:59 +0100 Subject: [PATCH 003/224] [sycl] fix init estimations (#11331) --- src/tree/fit_stump.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tree/fit_stump.cc b/src/tree/fit_stump.cc index 8fdcb3131646..144abcbd8131 100644 --- a/src/tree/fit_stump.cc +++ b/src/tree/fit_stump.cc @@ -74,7 +74,7 @@ void FitStump(Context const* ctx, MetaInfo const& info, linalg::MatrixReshape(n_targets); gpair.SetDevice(ctx->Device()); - auto gpair_t = gpair.View(ctx->Device()); + auto gpair_t = gpair.View(ctx->Device().IsSycl() ? 
DeviceOrd::CPU() : ctx->Device()); ctx->IsCUDA() ? cuda_impl::FitStump(ctx, info, gpair_t, out->View(ctx->Device())) : cpu_impl::FitStump(ctx, info, gpair_t, out->HostView()); } From 5927d5da838ae9e7f5dd0e9e7a9a88dd4d8199e4 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 13 Mar 2025 06:35:13 +0800 Subject: [PATCH 004/224] [EM] Disable the `on_host` option for CPU inputs. (#11333) --- src/data/sparse_page_dmatrix.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/data/sparse_page_dmatrix.cc b/src/data/sparse_page_dmatrix.cc index f3a26a391d9a..be726c80b48b 100644 --- a/src/data/sparse_page_dmatrix.cc +++ b/src/data/sparse_page_dmatrix.cc @@ -160,11 +160,11 @@ BatchSet SparsePageDMatrix::GetGradientIndex(Context const *ct CHECK_GE(param.max_bin, 2); } detail::CheckEmpty(batch_param_, param); - auto id = MakeCache(this, ".gradient_index.page", on_host_, cache_prefix_, &cache_info_); + auto id = MakeCache(this, ".gradient_index.page", false, cache_prefix_, &cache_info_); if (!cache_info_.at(id)->written || detail::RegenGHist(batch_param_, param)) { this->InitializeSparsePage(ctx); cache_info_.erase(id); - id = MakeCache(this, ".gradient_index.page", on_host_, cache_prefix_, &cache_info_); + id = MakeCache(this, ".gradient_index.page", false, cache_prefix_, &cache_info_); LOG(INFO) << "Generating new Gradient Index."; // Use sorted sketch for approx. auto sorted_sketch = param.regen; From d63d98b2ebfc0f847a4c1b1f40a02772fe065216 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 14 Mar 2025 14:07:08 +0800 Subject: [PATCH 005/224] Auto re-coding for the CPU predictor. (#11315) - inplace predict - DMatrix - Raise an error when input categories are floating points. This PR integrates the re-coder into the CPU predictor by defining an accessor for calculating the mapped values on the fly. For a numeric-only dataset, it's a no op. 
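
For illustration, a minimal sketch of the recoding behaviour during prediction
(it mirrors the run_cat_predict test added below; the pandas DataFrame input and
CPU device are assumptions for the example, not requirements of the change):

    import numpy as np
    import pandas as pd
    import xgboost as xgb

    # Train with a categorical column; the category -> code mapping is kept in the model.
    df = pd.DataFrame({"c": ["cdef", "abc", "def"]}, dtype="category")
    y = np.array([0.0, 1.0, 2.0])
    Xy = xgb.DMatrix(df, y, enable_categorical=True)
    booster = xgb.train({"device": "cpu"}, Xy, num_boost_round=4)

    # New data lists the categories in a different order; the predictor re-codes
    # them on the fly, so the result matches predicting on manually re-encoded codes.
    df_new = pd.DataFrame({"c": ["def", "abc"]}, dtype="category")
    predt = booster.inplace_predict(df_new)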
--- python-package/xgboost/testing/data.py | 4 +- python-package/xgboost/testing/ordinal.py | 181 +++++++++- src/common/error_msg.h | 7 + src/data/adapter.h | 12 +- src/data/cat_container.cc | 24 +- src/data/cat_container.cu | 50 ++- src/data/cat_container.h | 42 ++- src/data/device_adapter.cuh | 2 +- src/data/simple_dmatrix.cc | 5 + src/data/simple_dmatrix.cu | 2 +- src/data/sparse_page_dmatrix.cc | 4 +- src/encoder/ordinal.h | 7 + src/gbm/gbtree.cc | 12 + src/gbm/gbtree_model.cc | 14 +- src/gbm/gbtree_model.h | 23 +- src/learner.cc | 2 +- src/predictor/cpu_predictor.cc | 317 +++++++++++------- src/predictor/predict_fn.h | 42 +++ tests/cpp/data/test_cat_container.cu | 30 +- tests/python-gpu/test_gpu_updaters.py | 18 +- tests/python-gpu/test_gpu_with_sklearn.py | 11 +- tests/python/test_demos.py | 5 + tests/python/test_ordinal.py | 25 ++ tests/python/test_predict.py | 8 +- .../test_with_dask/test_with_dask.py | 2 +- 25 files changed, 673 insertions(+), 176 deletions(-) diff --git a/python-package/xgboost/testing/data.py b/python-package/xgboost/testing/data.py index 7124c48d9d0d..36367cdc26db 100644 --- a/python-package/xgboost/testing/data.py +++ b/python-package/xgboost/testing/data.py @@ -150,9 +150,11 @@ def pd_dtypes() -> Generator: # Categorical orig = orig.astype("category") + for c in orig.columns: + orig[c] = orig[c].cat.rename_categories(int) for Null in (np.nan, None, pd.NA): df = pd.DataFrame( - {"f0": [1.0, 2.0, Null, 3.0], "f1": [3.0, 2.0, Null, 1.0]}, + {"f0": [1, 2, Null, 3], "f1": [3, 2, Null, 1]}, dtype=pd.CategoricalDtype(), ) yield orig, df diff --git a/python-package/xgboost/testing/ordinal.py b/python-package/xgboost/testing/ordinal.py index 404d795951df..0d0ab6c21dfb 100644 --- a/python-package/xgboost/testing/ordinal.py +++ b/python-package/xgboost/testing/ordinal.py @@ -3,13 +3,16 @@ import os import tempfile -from typing import Any, Literal, Tuple, Type +from concurrent.futures import ThreadPoolExecutor +from typing import Any, Literal, Tuple, Type, TypeVar import numpy as np +import pytest from ..compat import import_cupy from ..core import DMatrix, ExtMemQuantileDMatrix, QuantileDMatrix from ..data import _lazy_load_cudf_is_cat +from ..training import train from .data import IteratorForTest, is_pd_cat_dtype, make_categorical @@ -233,3 +236,179 @@ def run_cat_container_iter(device: Literal["cpu", "cuda"]) -> None: for _, v in cats.items(): assert v.null_count == 0 assert len(v) == n_cats + + +def run_cat_predict(device: Literal["cpu", "cuda"]) -> None: + """Basic tests for re-coding during prediction.""" + Df, _ = get_df_impl(device) + + def run_basic(DMatrixT: Type) -> None: + df = Df({"c": ["cdef", "abc", "def"]}, dtype="category") + y = np.array([0, 1, 2]) + + codes = df.c.cat.codes + encoded = np.array([codes.iloc[2], codes.iloc[1]]) # used with the next df + + Xy = DMatrixT(df, y, enable_categorical=True) + booster = train({"device": device}, Xy, num_boost_round=4) + + df = Df({"c": ["def", "abc"]}, dtype="category") + codes = df.c.cat.codes + + predt0 = booster.inplace_predict(df) + predt1 = booster.inplace_predict(encoded) + + assert_allclose(device, predt0, predt1) + + fmat = DMatrixT(df, enable_categorical=True) + predt2 = booster.predict(fmat) + assert_allclose(device, predt0, predt2) + + for dm in (DMatrix, QuantileDMatrix): + run_basic(dm) + + def run_mixed(DMatrixT: Type) -> None: + df = Df({"b": [2, 1, 3], "c": ["cdef", "abc", "def"]}, dtype="category") + y = np.array([0, 1, 2]) + + # used with the next df + b_codes = df.b.cat.codes + 
np.testing.assert_allclose(np.asarray(b_codes), np.array([1, 0, 2])) + # pick codes of 3, 1 + b_encoded = np.array([b_codes.iloc[2], b_codes.iloc[1]]) + + c_codes = df.c.cat.codes + np.testing.assert_allclose(np.asarray(c_codes), np.array([1, 0, 2])) + # pick codes of "def", "abc" + c_encoded = np.array([c_codes.iloc[2], c_codes.iloc[1]]) + encoded = np.stack([b_encoded, c_encoded], axis=1) + + Xy = DMatrixT(df, y, enable_categorical=True) + booster = train({"device": device}, Xy, num_boost_round=4) + + df = Df({"b": [3, 1], "c": ["def", "abc"]}, dtype="category") + predt0 = booster.inplace_predict(df) + predt1 = booster.inplace_predict(encoded) + assert_allclose(device, predt0, predt1) + + fmat = DMatrixT(df, enable_categorical=True) + predt2 = booster.predict(fmat) + assert_allclose(device, predt0, predt2) + + for dm in (DMatrix, QuantileDMatrix): + run_mixed(dm) + + +def run_cat_invalid(device: Literal["cpu", "cuda"]) -> None: + """Basic tests for invalid inputs.""" + Df, _ = get_df_impl(device) + + def run_invalid(DMatrixT: Type) -> None: + df = Df({"b": [2, 1, 3], "c": ["cdef", "abc", "def"]}, dtype="category") + y = np.array([0, 1, 2]) + + Xy = DMatrixT(df, y, enable_categorical=True) + booster = train({"device": device}, Xy, num_boost_round=4) + df["b"] = df["b"].astype(np.int64) + with pytest.raises(ValueError, match="The data type doesn't match"): + booster.inplace_predict(df) + + Xy = DMatrixT(df, y, enable_categorical=True) + with pytest.raises(ValueError, match="The data type doesn't match"): + booster.predict(Xy) + + for dm in (DMatrix, QuantileDMatrix): + run_invalid(dm) + + +def run_cat_thread_safety(device: Literal["cpu", "cuda"]) -> None: + """Basic tests for thread safety.""" + X, y = make_categorical(2048, 16, 112, onehot=False, cat_ratio=0.5) + Xy = QuantileDMatrix(X, y, enable_categorical=True) + booster = train({"device": device}, Xy, num_boost_round=10) + + def run_thread_safety(DMatrixT: Type) -> bool: + Xy = DMatrixT(X, enable_categorical=True) + predt0 = booster.predict(Xy) + predt1 = booster.inplace_predict(X) + assert_allclose(device, predt0, predt1) + return True + + futures = [] + for dm in (DMatrix, QuantileDMatrix): + with ThreadPoolExecutor(max_workers=10) as e: + for _ in range(10): + fut = e.submit(run_thread_safety, dm) + futures.append(fut) + + for f in futures: + assert f.result() + + +U = TypeVar("U", DMatrix, QuantileDMatrix) + + +def _make_dm(DMatrixT: Type[U], ref: DMatrix, *args: Any, **kwargs: Any) -> U: + if DMatrixT is QuantileDMatrix: + return DMatrixT(*args, ref=ref, enable_categorical=True, **kwargs) + return DMatrixT(*args, enable_categorical=True, **kwargs) + + +def _run_predt( + device: str, + DMatrixT: Type, + pred_contribs: bool, + pred_interactions: bool, + pred_leaf: bool, +) -> None: + Df, _ = get_df_impl(device) + + df = Df({"c": ["cdef", "abc", "def"]}, dtype="category") + y = np.array([0, 1, 2]) + + codes = df.c.cat.codes + encoded = np.array([codes.iloc[2], codes.iloc[1]]) # used with the next df + + Xy = DMatrixT(df, y, enable_categorical=True) + booster = train({"device": device}, Xy, num_boost_round=4) + + df = Df({"c": ["def", "abc"]}, dtype="category") + codes = df.c.cat.codes + + # Contribution + predt0 = booster.predict( + _make_dm(DMatrixT, ref=Xy, data=df), + pred_contribs=pred_contribs, + pred_interactions=pred_interactions, + pred_leaf=pred_leaf, + ) + df = Df({"c": encoded}) + predt1 = booster.predict( + _make_dm(DMatrixT, ref=Xy, data=encoded.reshape(2, 1), feature_names=["c"]), + pred_contribs=pred_contribs, + 
pred_interactions=pred_interactions, + pred_leaf=pred_leaf, + ) + assert_allclose(device, predt0, predt1) + + +def run_cat_shap(device: Literal["cpu", "cuda"]) -> None: + """Basic tests for SHAP values.""" + + for dm in (DMatrix, QuantileDMatrix): + _run_predt( + device, dm, pred_contribs=True, pred_interactions=False, pred_leaf=False + ) + + for dm in (DMatrix, QuantileDMatrix): + _run_predt( + device, dm, pred_contribs=False, pred_interactions=True, pred_leaf=False + ) + + +def run_cat_leaf(device: Literal["cpu", "cuda"]) -> None: + """Basic tests for leaf prediction.""" + # QuantileDMatrix is not supported by leaf. + _run_predt( + device, DMatrix, pred_contribs=False, pred_interactions=False, pred_leaf=True + ) diff --git a/src/common/error_msg.h b/src/common/error_msg.h index 16652d1958ba..78168e1b1f13 100644 --- a/src/common/error_msg.h +++ b/src/common/error_msg.h @@ -128,5 +128,12 @@ constexpr StringView ZeroCudaMemory() { "support. If you are using other types of memory pool, please consider reserving a " "portion of the GPU memory for XGBoost."; } + +// float64 is not supported by JSON yet. Also, floating point as categories is tricky +// since floating point equality test is inaccurate for most hardware. +constexpr StringView NoFloatCat() { + return "Category index from DataFrame has floating point dtype, consider using strings or " + "integers instead."; +} } // namespace xgboost::error #endif // XGBOOST_COMMON_ERROR_MSG_H_ diff --git a/src/data/adapter.h b/src/data/adapter.h index a9e97b3feb1b..339fbcd90e5d 100644 --- a/src/data/adapter.h +++ b/src/data/adapter.h @@ -16,10 +16,11 @@ #include // for variant #include // for vector -#include "../common/math.h" -#include "../encoder/ordinal.h" // for CatStrArrayView -#include "../encoder/types.h" // for TupToVarT -#include "array_interface.h" // for CategoricalIndexArgTypes +#include "../common/error_msg.h" // for NoFloatCat +#include "../common/math.h" // for CheckNAN +#include "../encoder/ordinal.h" // for CatStrArrayView +#include "../encoder/types.h" // for TupToVarT +#include "array_interface.h" // for CategoricalIndexArgTypes #include "xgboost/base.h" #include "xgboost/data.h" #include "xgboost/logging.h" @@ -627,6 +628,9 @@ template using T = typename decltype(t)::value_type; constexpr bool kKnownType = enc::MemberOf, enc::CatPrimIndexTypes>::value; CHECK(kKnownType) << "Unsupported categorical index type."; + if constexpr (std::is_floating_point_v) { + LOG(FATAL) << error::NoFloatCat(); + } auto span = common::Span{t.Values().data(), t.Size()}; if constexpr (kKnownType) { p_cat_columns->emplace_back(span); diff --git a/src/data/cat_container.cc b/src/data/cat_container.cc index c70b7fc10579..d53eedf70fe9 100644 --- a/src/data/cat_container.cc +++ b/src/data/cat_container.cc @@ -9,8 +9,9 @@ #include // for move #include // for vector -#include "../encoder/types.h" // for Overloaded -#include "xgboost/json.h" // for Json +#include "../common/error_msg.h" // for NoFloatCat +#include "../encoder/types.h" // for Overloaded +#include "xgboost/json.h" // for Json namespace xgboost { CatContainer::CatContainer(enc::HostColumnsView const& df) : CatContainer{} { @@ -39,6 +40,12 @@ CatContainer::CatContainer(enc::HostColumnsView const& df) : CatContainer{} { using T = typename cpu_impl::ViewToStorageImpl>::Type; this->cpu_impl_->columns.emplace_back(); + using ElemT = typename T::value_type; + + if constexpr (std::is_floating_point_v) { + LOG(FATAL) << error::NoFloatCat(); + } + this->cpu_impl_->columns.back().emplace(); auto& v = 
std::get(this->cpu_impl_->columns.back()); v.resize(values.size()); @@ -54,6 +61,9 @@ CatContainer::CatContainer(enc::HostColumnsView const& df) : CatContainer{} { CHECK(this->HostCanRead()); CHECK_EQ(this->n_total_cats_, df.feature_segments.back()); CHECK_GE(this->n_total_cats_, 0) << "Too many categories."; + if (this->n_total_cats_ > 0) { + CHECK(!this->cpu_impl_->columns.empty()); + } } namespace { @@ -229,17 +239,21 @@ CatContainer::CatContainer() : cpu_impl_{std::make_uniqueCopyCommon(that); } +void CatContainer::Copy(Context const* ctx, CatContainer const& that) { + [[maybe_unused]] auto h_view = that.HostView(); + this->CopyCommon(ctx, that); + this->cpu_impl_->Copy(that.cpu_impl_.get()); +} [[nodiscard]] enc::HostColumnsView CatContainer::HostView() const { return this->HostViewImpl(); } +[[nodiscard]] bool CatContainer::Empty() const { return this->cpu_impl_->columns.empty(); } + void CatContainer::Sort(Context const* ctx) { CHECK(ctx->IsCPU()); auto view = this->HostView(); this->sorted_idx_.HostVector().resize(view.n_total_cats); enc::SortNames(enc::Policy{}, view, this->sorted_idx_.HostSpan()); } - -[[nodiscard]] bool CatContainer::DeviceCanRead() const { return false; } #endif // !defined(XGBOOST_USE_CUDA) } // namespace xgboost diff --git a/src/data/cat_container.cu b/src/data/cat_container.cu index fa5134905f77..64f206528cc6 100644 --- a/src/data/cat_container.cu +++ b/src/data/cat_container.cu @@ -141,20 +141,23 @@ CatContainer::CatContainer(DeviceOrd device, enc::DeviceColumnsView const& df) : if (this->n_total_cats_ > 0) { CHECK(this->DeviceCanRead()); CHECK(!this->HostCanRead()); + CHECK(!this->cu_impl_->columns.empty()); } } CatContainer::~CatContainer() = default; -[[nodiscard]] bool CatContainer::DeviceCanRead() const { return !this->cu_impl_->columns.empty(); } - void CatContainer::Copy(Context const* ctx, CatContainer const& that) { - this->CopyCommon(that); if (ctx->IsCPU()) { - auto h_view = that.HostView(); - CHECK(!h_view.Empty()); + // Pull data to host + [[maybe_unused]] auto h_view = that.HostView(); + this->CopyCommon(ctx, that); this->cpu_impl_->Copy(that.cpu_impl_.get()); + CHECK(!this->DeviceCanRead()); } else { + // Pull data to device + [[maybe_unused]] auto d_view = that.DeviceView(ctx); + this->CopyCommon(ctx, that); auto const& that_impl = that.cu_impl_; this->cu_impl_->columns.resize(that.cu_impl_->columns.size()); @@ -186,17 +189,38 @@ void CatContainer::Copy(Context const* ctx, CatContainer const& that) { col); } this->cu_impl_->columns_v = h_columns_v; + CHECK(this->Empty() || !this->HostCanRead()); + } + if (ctx->IsCPU()) { + CHECK_EQ(this->cpu_impl_->columns_v.size(), that.cpu_impl_->columns_v.size()); + CHECK_EQ(this->cpu_impl_->columns.size(), that.cpu_impl_->columns.size()); + CHECK(this->HostCanRead()); + } else { + CHECK_EQ(this->cu_impl_->columns_v.size(), that.cu_impl_->columns_v.size()); + CHECK_EQ(this->cu_impl_->columns.size(), that.cu_impl_->columns.size()); + CHECK(this->DeviceCanRead()); } + CHECK_EQ(this->Empty(), that.Empty()); + CHECK_EQ(this->NumCatsTotal(), that.NumCatsTotal()); +} + +[[nodiscard]] bool CatContainer::Empty() const { + return this->HostCanRead() ? 
this->cpu_impl_->columns.empty() : this->cu_impl_->columns.empty(); } void CatContainer::Sort(Context const* ctx) { + if (!this->HasCategorical()) { + return; + } + if (ctx->IsCPU()) { auto view = this->HostView(); + CHECK(!view.Empty()) << view.n_total_cats; this->sorted_idx_.HostVector().resize(view.n_total_cats); - enc::SortNames(enc::Policy{}, view, this->sorted_idx_.HostSpan()); + enc::SortNames(cpu_impl::EncPolicy, view, this->sorted_idx_.HostSpan()); } else { auto view = this->DeviceView(ctx); - CHECK(!view.Empty()) << this->HostView().Size(); + CHECK(!view.Empty()) << view.n_total_cats; this->sorted_idx_.SetDevice(ctx->Device()); this->sorted_idx_.Resize(view.n_total_cats); enc::SortNames(cuda_impl::EncPolicy, view, this->sorted_idx_.DeviceSpan()); @@ -206,21 +230,29 @@ void CatContainer::Sort(Context const* ctx) { [[nodiscard]] enc::HostColumnsView CatContainer::HostView() const { std::lock_guard guard{device_mu_}; if (!this->HostCanRead()) { + this->feature_segments_.ConstHostSpan(); // Lazy copy to host this->cu_impl_->CopyTo(this->cpu_impl_.get()); } + CHECK(this->HostCanRead()); return this->HostViewImpl(); } [[nodiscard]] enc::DeviceColumnsView CatContainer::DeviceView(Context const* ctx) const { CHECK(ctx->IsCUDA()); std::lock_guard guard{device_mu_}; - this->feature_segments_.SetDevice(ctx->Device()); if (!this->DeviceCanRead()) { + this->feature_segments_.SetDevice(ctx->Device()); + this->feature_segments_.ConstDeviceSpan(); // Lazy copy to device auto h_view = this->HostViewImpl(); - CHECK(!h_view.Empty()); this->cu_impl_->CopyFrom(h_view); + CHECK_EQ(this->cu_impl_->columns_v.size(), this->cpu_impl_->columns_v.size()); + CHECK_EQ(this->cu_impl_->columns.size(), this->cpu_impl_->columns.size()); + } + CHECK(this->DeviceCanRead()); + if (this->n_total_cats_ != 0) { + CHECK(!this->cu_impl_->columns_v.empty()); } return {dh::ToSpan(this->cu_impl_->columns_v), this->feature_segments_.ConstDeviceSpan(), this->n_total_cats_}; diff --git a/src/data/cat_container.h b/src/data/cat_container.h index b6ceed1f4219..1990e51a81f8 100644 --- a/src/data/cat_container.h +++ b/src/data/cat_container.h @@ -104,22 +104,37 @@ struct CatContainerImpl; */ class CatContainer { /** - * @brief Implementation of the Copy method, used by both CPU and GPU. + * @brief Implementation of the Copy method, used by both CPU and GPU. Note that this + * method changes the permission in the HostDeviceVector as we need to pull data into + * targeted devices. 
*/ - void CopyCommon(CatContainer const& that) { - this->sorted_idx_.SetDevice(that.sorted_idx_.Device()); + void CopyCommon(Context const* ctx, CatContainer const& that) { + auto device = ctx->Device(); + + that.sorted_idx_.SetDevice(device); + this->sorted_idx_.SetDevice(device); this->sorted_idx_.Resize(that.sorted_idx_.Size()); this->sorted_idx_.Copy(that.sorted_idx_); - this->feature_segments_.SetDevice(that.feature_segments_.Device()); + this->feature_segments_.SetDevice(device); + that.feature_segments_.SetDevice(device); this->feature_segments_.Resize(that.feature_segments_.Size()); this->feature_segments_.Copy(that.feature_segments_); this->n_total_cats_ = that.n_total_cats_; + + if (!device.IsCPU()) { + // Pull to device + this->sorted_idx_.ConstDevicePointer(); + this->feature_segments_.ConstDevicePointer(); + } } [[nodiscard]] enc::HostColumnsView HostViewImpl() const { CHECK_EQ(this->cpu_impl_->columns.size(), this->cpu_impl_->columns_v.size()); + if (this->n_total_cats_ != 0) { + CHECK(!this->cpu_impl_->columns_v.empty()); + } return {common::Span{this->cpu_impl_->columns_v}, this->feature_segments_.ConstHostSpan(), this->n_total_cats_}; } @@ -134,17 +149,21 @@ class CatContainer { void Copy(Context const* ctx, CatContainer const& that); - [[nodiscard]] bool HostCanRead() const { - return !this->cpu_impl_->columns.empty() || this->n_total_cats_ == 0; - } - [[nodiscard]] bool DeviceCanRead() const; + [[nodiscard]] bool HostCanRead() const { return this->feature_segments_.HostCanRead(); } + [[nodiscard]] bool DeviceCanRead() const { return this->feature_segments_.DeviceCanRead(); } // Mostly used for testing. void Push(cpu_impl::ColumnType const& column) { this->cpu_impl_->columns.emplace_back(column); } - - [[nodiscard]] bool Empty() const { return this->cpu_impl_->columns.empty(); } + /** + * @brief Wether the container is initialized at all. If the input is not a DataFrame, + * this method returns True. + */ + [[nodiscard]] bool Empty() const; [[nodiscard]] std::size_t NumFeatures() const { return this->cpu_impl_->columns.size(); } + /** + * @brief The number of categories across all features. 
+ */ [[nodiscard]] std::size_t NumCatsTotal() const { return this->n_total_cats_; } /** @@ -160,10 +179,9 @@ class CatContainer { [[nodiscard]] common::Span RefSortedIndex(Context const* ctx) const { std::lock_guard guard{device_mu_}; if (ctx->IsCPU()) { - CHECK(this->sorted_idx_.HostCanRead()); return this->sorted_idx_.ConstHostSpan(); } else { - CHECK(this->sorted_idx_.DeviceCanRead()); + sorted_idx_.SetDevice(ctx->Device()); return this->sorted_idx_.ConstDeviceSpan(); } } diff --git a/src/data/device_adapter.cuh b/src/data/device_adapter.cuh index 6203435b8c95..672767db92c4 100644 --- a/src/data/device_adapter.cuh +++ b/src/data/device_adapter.cuh @@ -109,7 +109,7 @@ class CudfAdapter : public detail::SingleBatchDataIter { explicit CudfAdapter(std::string cuda_interfaces_str) : CudfAdapter{StringView{cuda_interfaces_str}} {} - const CudfAdapterBatch& Value() const override { + [[nodiscard]] CudfAdapterBatch const& Value() const override { CHECK_EQ(batch_.columns_.data(), columns_.data().get()); return batch_; } diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc index 0cdaccad4109..c25bdf8befc7 100644 --- a/src/data/simple_dmatrix.cc +++ b/src/data/simple_dmatrix.cc @@ -50,10 +50,15 @@ DMatrix* SimpleDMatrix::Slice(common::Span ridxs) { out->Info() = this->Info().Slice(&ctx, h_ridx, h_offset.back()); } out->fmat_ctx_ = this->fmat_ctx_; + + out->Info().Cats()->Copy(&fmat_ctx_, *this->Info().Cats()); return out; } DMatrix* SimpleDMatrix::SliceCol(int num_slices, int slice_id) { + if (this->Cats()->HasCategorical()) { + LOG(FATAL) << "Slicing column is not supported for DataFrames with categorical columns."; + } auto out = new SimpleDMatrix; SparsePage& out_page = *out->sparse_page_; auto const slice_size = info_.num_col_ / num_slices; diff --git a/src/data/simple_dmatrix.cu b/src/data/simple_dmatrix.cu index 1436d982bc29..f502f5ee56c8 100644 --- a/src/data/simple_dmatrix.cu +++ b/src/data/simple_dmatrix.cu @@ -7,7 +7,7 @@ #include "../common/cuda_rt_utils.h" // for CurrentDevice #include "cat_container.h" // for CatContainer -#include "device_adapter.cuh" // for CurrentDevice +#include "device_adapter.cuh" #include "simple_dmatrix.cuh" #include "simple_dmatrix.h" #include "xgboost/context.h" // for Context diff --git a/src/data/sparse_page_dmatrix.cc b/src/data/sparse_page_dmatrix.cc index be726c80b48b..160602549324 100644 --- a/src/data/sparse_page_dmatrix.cc +++ b/src/data/sparse_page_dmatrix.cc @@ -127,7 +127,7 @@ BatchSet SparsePageDMatrix::GetRowBatches() { } BatchSet SparsePageDMatrix::GetColumnBatches(Context const *ctx) { - auto id = MakeCache(this, ".col.page", on_host_, cache_prefix_, &cache_info_); + auto id = MakeCache(this, ".col.page", false, cache_prefix_, &cache_info_); CHECK_NE(this->Info().num_col_, 0); this->InitializeSparsePage(ctx); if (!column_source_) { @@ -141,7 +141,7 @@ BatchSet SparsePageDMatrix::GetColumnBatches(Context const *ctx) { } BatchSet SparsePageDMatrix::GetSortedColumnBatches(Context const *ctx) { - auto id = MakeCache(this, ".sorted.col.page", on_host_, cache_prefix_, &cache_info_); + auto id = MakeCache(this, ".sorted.col.page", false, cache_prefix_, &cache_info_); CHECK_NE(this->Info().num_col_, 0); this->InitializeSparsePage(ctx); if (!sorted_column_source_) { diff --git a/src/encoder/ordinal.h b/src/encoder/ordinal.h index 83269d3c913f..bfb334d29666 100644 --- a/src/encoder/ordinal.h +++ b/src/encoder/ordinal.h @@ -342,6 +342,13 @@ void Recode(ExecPolicy const &policy, HostColumnsView orig_enc, Span* in_gpair, bst_target_t const 
n_groups = model_.learner_model_param->OutputLength(); monitor_.Start("BoostNewTrees"); + // Define the categories. + if (this->model_.Cats()->Empty() && !p_fmat->Cats()->Empty()) { + auto in_cats = p_fmat->Cats(); + this->model_.Cats()->Copy(this->ctx_, *in_cats); + this->model_.Cats()->Sort(this->ctx_); + } else { + CHECK_EQ(this->model_.Cats()->NumCatsTotal(), p_fmat->Cats()->NumCatsTotal()) + << "A new dataset with different categorical features is used for training an existing " + "model."; + } + predt->predictions.SetDevice(ctx_->Device()); auto out = linalg::MakeTensorView(ctx_, &predt->predictions, p_fmat->Info().num_row_, model_.learner_model_param->OutputLength()); diff --git a/src/gbm/gbtree_model.cc b/src/gbm/gbtree_model.cc index 2edb456c95de..c94c6525fea2 100644 --- a/src/gbm/gbtree_model.cc +++ b/src/gbm/gbtree_model.cc @@ -1,5 +1,5 @@ /** - * Copyright 2019-2023, XGBoost Contributors + * Copyright 2019-2024, XGBoost Contributors */ #include "gbtree_model.h" @@ -132,6 +132,8 @@ void GBTreeModel::SaveModel(Json* p_out) const { std::transform(iteration_indptr.cbegin(), iteration_indptr.cend(), jiteration_indptr.begin(), [](bst_tree_t i) { return Integer{i}; }); out["iteration_indptr"] = Array{std::move(jiteration_indptr)}; + + this->Cats()->Save(&out["cats"]); } void GBTreeModel::LoadModel(Json const& in) { @@ -142,11 +144,11 @@ void GBTreeModel::LoadModel(Json const& in) { auto const& jmodel = get(in); - auto const& trees_json = get(in["trees"]); + auto const& trees_json = get(jmodel.at("trees")); CHECK_EQ(trees_json.size(), param.num_trees); trees.resize(param.num_trees); - auto const& tree_info_json = get(in["tree_info"]); + auto const& tree_info_json = get(jmodel.at("tree_info")); CHECK_EQ(tree_info_json.size(), param.num_trees); tree_info.resize(param.num_trees); @@ -171,6 +173,12 @@ void GBTreeModel::LoadModel(Json const& in) { MakeIndptr(this); } + auto p_cats = std::make_shared(); + auto cat_it = jmodel.find("cats"); + if (cat_it != jmodel.cend()) { + p_cats->Load(cat_it->second); + } + this->cats_ = std::move(p_cats); Validate(*this); } diff --git a/src/gbm/gbtree_model.h b/src/gbm/gbtree_model.h index 32fa868638bb..7d7893fb3391 100644 --- a/src/gbm/gbtree_model.h +++ b/src/gbm/gbtree_model.h @@ -1,6 +1,7 @@ /** - * Copyright 2017-2023, XGBoost Contributors - * \file gbtree_model.h + * Copyright 2017-2025, XGBoost Contributors + * + * @file gbtree_model.h */ #ifndef XGBOOST_GBM_GBTREE_MODEL_H_ #define XGBOOST_GBM_GBTREE_MODEL_H_ @@ -19,6 +20,7 @@ #include #include "../common/threading_utils.h" +#include "../data/cat_container.h" // for CatContainer namespace xgboost { @@ -94,7 +96,7 @@ struct GBTreeModel : public Model { void InitTreesToUpdate() { if (trees_to_update.size() == 0u) { - for (auto & tree : trees) { + for (auto& tree : trees) { trees_to_update.push_back(std::move(tree)); } trees.clear(); @@ -146,22 +148,27 @@ struct GBTreeModel : public Model { // model parameter GBTreeModelParam param; /*! \brief vector of trees stored in the model */ - std::vector > trees; + std::vector> trees; /*! \brief for the update process, a place to keep the initial trees */ - std::vector > trees_to_update; + std::vector> trees_to_update; /** - * \brief Group index for trees. + * @brief Group index for trees. */ std::vector tree_info; /** - * \brief Number of trees accumulated for each iteration. + * @brief Number of trees accumulated for each iteration. 
*/ std::vector iteration_indptr{0}; + [[nodiscard]] CatContainer const* Cats() const { return this->cats_.get(); } + [[nodiscard]] CatContainer* Cats() { return this->cats_.get(); } + void Cats(std::shared_ptr cats) { this->cats_ = cats; } + private: /** - * \brief Whether the stack contains multi-target tree. + * @brief Categories in the training data. */ + std::shared_ptr cats_{std::make_shared()}; Context const* ctx_; }; } // namespace gbm diff --git a/src/learner.cc b/src/learner.cc index 34f395beb34b..d45b533396db 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -843,7 +843,7 @@ class LearnerConfiguration : public Learner { } } - void InitEstimation(MetaInfo const& info, linalg::Tensor* base_score) { + void InitEstimation(MetaInfo const& info, linalg::Vector* base_score) { base_score->Reshape(1); collective::ApplyWithLabels(this->Ctx(), info, base_score->Data(), [&] { UsePtr(obj_)->InitEstimation(info, base_score); }); diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc index 9e6289c2b630..d986882a6795 100644 --- a/src/predictor/cpu_predictor.cc +++ b/src/predictor/cpu_predictor.cc @@ -20,6 +20,7 @@ #include "../common/math.h" // for CheckNAN #include "../common/threading_utils.h" // for ParallelFor #include "../data/adapter.h" // for ArrayAdapter, CSRAdapter, CSRArrayAdapter +#include "../data/cat_container.h" // for CatContainer #include "../data/gradient_index.h" // for GHistIndexMatrix #include "../data/proxy_dmatrix.h" // for DMatrixProxy #include "../gbm/gbtree_model.h" // for GBTreeModel, GBTreeModelParam @@ -96,11 +97,11 @@ void PredValueByOneTree(RegTree::FVec const &p_feats, MultiTargetTree const &tre } // namespace multi namespace { -void PredictByAllTrees(gbm::GBTreeModel const &model, std::uint32_t const tree_begin, - std::uint32_t const tree_end, std::size_t const predict_offset, +void PredictByAllTrees(gbm::GBTreeModel const &model, bst_tree_t const tree_begin, + bst_tree_t const tree_end, std::size_t const predict_offset, std::vector const &thread_temp, std::size_t const offset, std::size_t const block_size, linalg::MatrixView out_predt) { - for (std::uint32_t tree_id = tree_begin; tree_id < tree_end; ++tree_id) { + for (bst_tree_t tree_id = tree_begin; tree_id < tree_end; ++tree_id) { auto const &tree = *model.trees.at(tree_id); auto const &cats = tree.GetCategoriesMatrix(); bool has_categorical = tree.HasCategoricalSplit(); @@ -169,28 +170,35 @@ struct DataToFeatVec { } }; -struct SparsePageView : public DataToFeatVec { - bst_idx_t base_rowid; - HostSparsePageView view; +template +class SparsePageView : public DataToFeatVec> { + EncAccessor acc_; + HostSparsePageView const view_; - explicit SparsePageView(SparsePage const *p) : base_rowid{p->base_rowid} { view = p->GetView(); } - [[nodiscard]] std::size_t Size() const { return view.Size(); } + public: + bst_idx_t const base_rowid; + + SparsePageView(SparsePage const *p, EncAccessor &&acc) + : acc_{std::forward(acc)}, view_{p->GetView()}, base_rowid{p->base_rowid} {} + [[nodiscard]] std::size_t Size() const { return view_.Size(); } [[nodiscard]] bst_idx_t DoFill(bst_idx_t ridx, float *out) const { - auto p_data = view[ridx].data(); + auto p_data = view_[ridx].data(); - for (std::size_t i = 0, n = view[ridx].size(); i < n; ++i) { + for (std::size_t i = 0, n = view_[ridx].size(); i < n; ++i) { auto const &entry = p_data[i]; - out[entry.index] = entry.fvalue; + out[entry.index] = acc_(entry); } - return view[ridx].size(); + return view_[ridx].size(); } }; -struct GHistIndexMatrixView : public 
DataToFeatVec { +template +class GHistIndexMatrixView : public DataToFeatVec> { private: GHistIndexMatrix const &page_; + EncAccessor acc_; common::Span ft_; std::vector const &ptrs_; @@ -202,8 +210,10 @@ struct GHistIndexMatrixView : public DataToFeatVec { bst_idx_t const base_rowid; public: - GHistIndexMatrixView(GHistIndexMatrix const &_page, common::Span ft) + GHistIndexMatrixView(GHistIndexMatrix const &_page, EncAccessor &&acc, + common::Span ft) : page_{_page}, + acc_{acc}, ft_{ft}, ptrs_{_page.cut.Ptrs()}, mins_{_page.cut.MinValues()}, @@ -232,30 +242,30 @@ struct GHistIndexMatrixView : public DataToFeatVec { fvalue = common::HistogramCuts::NumericBinValue(this->ptrs_, values_, mins_, fidx, bin_idx); } - out[fidx] = fvalue; + out[fidx] = acc_(fvalue, fidx); } }); n_non_missings += n_features; } else { for (bst_feature_t fidx = 0; fidx < n_features; ++fidx) { - float f = std::numeric_limits::quiet_NaN(); + float fvalue = std::numeric_limits::quiet_NaN(); bool is_cat = common::IsCat(ft_, fidx); if (columns_.GetColumnType(fidx) == common::kSparseColumn) { // Special handling for extremely sparse data. Just binary search. auto bin_idx = page_.GetGindex(gridx, fidx); if (bin_idx != -1) { if (is_cat) { - f = values_[bin_idx]; + fvalue = values_[bin_idx]; } else { - f = common::HistogramCuts::NumericBinValue(this->ptrs_, values_, mins_, fidx, - bin_idx); + fvalue = common::HistogramCuts::NumericBinValue(this->ptrs_, values_, mins_, fidx, + bin_idx); } } } else { - f = page_.GetFvalue(ptrs_, values_, mins_, gridx, fidx, is_cat); + fvalue = page_.GetFvalue(ptrs_, values_, mins_, gridx, fidx, is_cat); } - if (!common::CheckNAN(f)) { - out[fidx] = f; + if (!common::CheckNAN(fvalue)) { + out[fidx] = acc_(fvalue, fidx); n_non_missings++; } } @@ -263,17 +273,18 @@ struct GHistIndexMatrixView : public DataToFeatVec { return n_non_missings; } - [[nodiscard]] auto Size() const { return page_.Size(); } + [[nodiscard]] bst_idx_t Size() const { return page_.Size(); } }; -template -class AdapterView : public DataToFeatVec> { +template +class AdapterView : public DataToFeatVec> { Adapter const *adapter_; float missing_; + EncAccessor const &acc_; public: - explicit AdapterView(Adapter const *adapter, float missing) - : adapter_{adapter}, missing_{missing} {} + explicit AdapterView(Adapter const *adapter, float missing, EncAccessor const &acc) + : adapter_{adapter}, missing_{missing}, acc_{acc} {} [[nodiscard]] bst_idx_t DoFill(bst_idx_t ridx, float *out) const { auto const &batch = adapter_->Value(); @@ -282,20 +293,21 @@ class AdapterView : public DataToFeatVec> { for (size_t c = 0; c < row.Size(); ++c) { auto e = row.GetElement(c); if (missing_ != e.value && !common::CheckNAN(e.value)) { - out[e.column_idx] = e.value; + auto fvalue = this->acc_(e); + out[e.column_idx] = fvalue; n_non_missings++; } } return n_non_missings; } - [[nodiscard]] size_t Size() const { return adapter_->NumRows(); } + [[nodiscard]] bst_idx_t Size() const { return adapter_->NumRows(); } bst_idx_t const static base_rowid = 0; // NOLINT }; -template -void PredictBatchByBlockOfRowsKernel(DataView batch, gbm::GBTreeModel const &model, +template +void PredictBatchByBlockOfRowsKernel(DataView const &batch, gbm::GBTreeModel const &model, bst_tree_t tree_begin, bst_tree_t tree_end, std::vector *p_thread_temp, std::int32_t n_threads, @@ -354,6 +366,27 @@ static void InitThreadTemp(int nthread, std::vector *out) { out->resize(nthread, RegTree::FVec()); } } + +auto MakeCatAccessor(Context const *ctx, enc::HostColumnsView const &cats, + 
gbm::GBTreeModel const &model) { + std::vector mapping(cats.n_total_cats); + auto sorted_idx = model.Cats()->RefSortedIndex(ctx); + auto orig_enc = model.Cats()->HostView(); + enc::Recode(cpu_impl::EncPolicy, orig_enc, sorted_idx, cats, common::Span{mapping}); + auto cats_mapping = enc::MappingView{cats.feature_segments, mapping}; + auto acc = CatAccessor{cats_mapping}; + return std::tuple{acc, std::move(mapping)}; +} + +bool ShouldUseBlock(DMatrix *p_fmat) { + // Threshold to use block-based prediction. + constexpr double kDensityThresh = .5; + bst_idx_t n_samples = p_fmat->Info().num_row_; + bst_idx_t total = std::max(n_samples * p_fmat->Info().num_col_, static_cast(1)); + double density = static_cast(p_fmat->Info().num_nonzero_) / static_cast(total); + bool blocked = density > kDensityThresh; + return blocked; +} } // anonymous namespace /** @@ -412,22 +445,25 @@ class ColumnSplitHelper { void PredictDMatrix(Context const *ctx, DMatrix *p_fmat, std::vector *out_preds) { CHECK(xgboost::collective::IsDistributed()) << "column-split prediction is only supported for distributed training"; + if (this->model_.Cats()->HasCategorical()) { + LOG(FATAL) << "Categorical feature is not yet supported with column-split."; + } for (auto const &batch : p_fmat->GetBatches()) { CHECK_EQ(out_preds->size(), p_fmat->Info().num_row_ * model_.learner_model_param->num_output_group); - PredictBatchKernel(ctx, SparsePageView{&batch}, out_preds); + PredictBatchKernel(ctx, SparsePageView{&batch, NoOpAccessor{}}, out_preds); } } - void PredictLeaf(Context const* ctx, DMatrix *p_fmat, std::vector *out_preds) { + void PredictLeaf(Context const *ctx, DMatrix *p_fmat, std::vector *out_preds) { CHECK(xgboost::collective::IsDistributed()) << "column-split prediction is only supported for distributed training"; for (auto const &batch : p_fmat->GetBatches()) { CHECK_EQ(out_preds->size(), p_fmat->Info().num_row_ * (tree_end_ - tree_begin_)); - PredictBatchKernel(ctx, SparsePageView{&batch}, - out_preds); + PredictBatchKernel(ctx, SparsePageView{&batch, NoOpAccessor{}}, + out_preds); } } @@ -548,8 +584,8 @@ class ColumnSplitHelper { } } - template - void PredictBatchKernel(Context const* ctx, DataView batch, std::vector *out_preds) { + template + void PredictBatchKernel(Context const *ctx, DataView batch, std::vector *out_preds) { auto const num_group = model_.learner_model_param->num_output_group; // parallel over local batch @@ -646,6 +682,7 @@ class CPUPredictor : public Predictor { if (p_fmat->Info().IsColumnSplit()) { CHECK(!model.learner_model_param->IsVectorLeaf()) << "Predict DMatrix with column split" << MTNotImplemented(); + CHECK(!model.Cats()->HasCategorical()) << "The re-coder doesn't support column split yet."; ColumnSplitHelper helper(this->ctx_->Threads(), model, tree_begin, tree_end); helper.PredictDMatrix(ctx_, p_fmat, out_preds); @@ -653,46 +690,54 @@ class CPUPredictor : public Predictor { } auto const n_threads = this->ctx_->Threads(); - constexpr double kDensityThresh = .5; - size_t total = - std::max(p_fmat->Info().num_row_ * p_fmat->Info().num_col_, static_cast(1)); - double density = static_cast(p_fmat->Info().num_nonzero_) / static_cast(total); - bool blocked = density > kDensityThresh; + + bool blocked = ShouldUseBlock(p_fmat); std::vector feat_vecs; InitThreadTemp(n_threads * (blocked ? 
kBlockOfRowsSize : 1), &feat_vecs); - std::size_t n_samples = p_fmat->Info().num_row_; - std::size_t n_groups = model.learner_model_param->OutputLength(); + // Create a writable view on the output prediction vector. + bst_idx_t n_groups = model.learner_model_param->OutputLength(); + bst_idx_t n_samples = p_fmat->Info().num_row_; CHECK_EQ(out_preds->size(), n_samples * n_groups); auto out_predt = linalg::MakeTensorView(ctx_, *out_preds, n_samples, n_groups); - if (!p_fmat->PageExists()) { - auto ft = p_fmat->Info().feature_types.ConstHostVector(); - for (auto const &batch : p_fmat->GetBatches(ctx_, {})) { - if (blocked) { - PredictBatchByBlockOfRowsKernel( - GHistIndexMatrixView{batch, ft}, model, tree_begin, tree_end, &feat_vecs, n_threads, - out_predt); - } else { - PredictBatchByBlockOfRowsKernel( - GHistIndexMatrixView{batch, ft}, model, tree_begin, tree_end, &feat_vecs, n_threads, - out_predt); + // Dispatching function for various configuration. + auto launch = [&](auto &&acc) { + using Enc = std::remove_reference_t; // The encoder. + if (!p_fmat->PageExists()) { + // Run prediction on QDM. + auto ft = p_fmat->Info().feature_types.ConstHostVector(); + for (auto const &page : p_fmat->GetBatches(ctx_, {})) { + auto batch = GHistIndexMatrixView{page, std::forward(acc), ft}; + if (blocked) { + PredictBatchByBlockOfRowsKernel(batch, model, tree_begin, tree_end, + &feat_vecs, n_threads, out_predt); + } else { + PredictBatchByBlockOfRowsKernel<1>(batch, model, tree_begin, tree_end, &feat_vecs, + n_threads, out_predt); + } } - } - } else { - for (auto const &batch : p_fmat->GetBatches()) { - if (blocked) { - PredictBatchByBlockOfRowsKernel( - SparsePageView{&batch}, model, tree_begin, tree_end, &feat_vecs, n_threads, - out_predt); - - } else { - PredictBatchByBlockOfRowsKernel(SparsePageView{&batch}, model, - tree_begin, tree_end, &feat_vecs, - n_threads, out_predt); + } else { + // Run prediction on SparsePage + for (auto const &page : p_fmat->GetBatches()) { + auto batch = SparsePageView{&page, std::forward(acc)}; + if (blocked) { + PredictBatchByBlockOfRowsKernel(batch, model, tree_begin, tree_end, + &feat_vecs, n_threads, out_predt); + } else { + PredictBatchByBlockOfRowsKernel<1>(batch, model, tree_begin, tree_end, &feat_vecs, + n_threads, out_predt); + } } } + }; + + if (model.Cats()->HasCategorical() && !p_fmat->Cats()->Empty()) { + auto [acc, mapping] = MakeCatAccessor(ctx_, p_fmat->Cats()->HostView(), model); + launch(acc); + } else { + launch(NoOpAccessor{}); } } @@ -769,9 +814,9 @@ class CPUPredictor : public Predictor { this->PredictDMatrix(dmat, &out_preds->HostVector(), model, tree_begin, tree_end); } - template + template void DispatchedInplacePredict(std::any const &x, std::shared_ptr p_m, - const gbm::GBTreeModel &model, float missing, + gbm::GBTreeModel const &model, float missing, PredictionCacheEntry *out_preds, bst_tree_t tree_begin, bst_tree_t tree_end) const { auto const n_threads = this->ctx_->Threads(); @@ -783,39 +828,59 @@ class CPUPredictor : public Predictor { CHECK_EQ(p_m->Info().num_col_, m->NumColumns()); this->InitOutPredictions(p_m->Info(), &(out_preds->predictions), model); + bool blocked = ShouldUseBlock(p_m.get()); + auto &predictions = out_preds->predictions.HostVector(); std::vector thread_temp; - InitThreadTemp(n_threads * kBlockSize, &thread_temp); - std::size_t n_groups = model.learner_model_param->OutputLength(); + InitThreadTemp(n_threads * (blocked ? 
kBlockOfRowsSize : 1), &thread_temp); + bst_idx_t n_groups = model.learner_model_param->OutputLength(); auto out_predt = linalg::MakeTensorView(ctx_, predictions, m->NumRows(), n_groups); - PredictBatchByBlockOfRowsKernel, kBlockSize>( - AdapterView(m.get(), missing), model, tree_begin, tree_end, &thread_temp, - n_threads, out_predt); + + auto launch = [&](auto &&acc) { + auto view = AdapterView{m.get(), missing, acc}; + if (blocked) { + PredictBatchByBlockOfRowsKernel(view, model, tree_begin, tree_end, + &thread_temp, n_threads, out_predt); + } else { + PredictBatchByBlockOfRowsKernel<1>(view, model, tree_begin, tree_end, &thread_temp, + n_threads, out_predt); + } + }; + + if constexpr (std::is_same_v) { + // Make specialization for DataFrame where we need encoding. + if (model.Cats()->HasCategorical()) { + auto [acc, mapping] = MakeCatAccessor(ctx_, m->Cats(), model); + return launch(acc); + } + } + launch(NoOpAccessor{}); } - bool InplacePredict(std::shared_ptr p_m, const gbm::GBTreeModel &model, float missing, + bool InplacePredict(std::shared_ptr p_m, gbm::GBTreeModel const &model, float missing, PredictionCacheEntry *out_preds, bst_tree_t tree_begin, bst_tree_t tree_end) const override { auto proxy = dynamic_cast(p_m.get()); - CHECK(proxy)<< error::InplacePredictProxy(); + CHECK(proxy) << error::InplacePredictProxy(); CHECK(!p_m->Info().IsColumnSplit()) << "Inplace predict support for column-wise data split is not yet implemented."; - auto x = proxy->Adapter(); + auto const &x = proxy->Adapter(); + if (x.type() == typeid(std::shared_ptr)) { - this->DispatchedInplacePredict( - x, p_m, model, missing, out_preds, tree_begin, tree_end); + this->DispatchedInplacePredict(x, p_m, model, missing, out_preds, + tree_begin, tree_end); } else if (x.type() == typeid(std::shared_ptr)) { - this->DispatchedInplacePredict(x, p_m, model, missing, out_preds, - tree_begin, tree_end); + this->DispatchedInplacePredict(x, p_m, model, missing, out_preds, + tree_begin, tree_end); } else if (x.type() == typeid(std::shared_ptr)) { - this->DispatchedInplacePredict( - x, p_m, model, missing, out_preds, tree_begin, tree_end); + this->DispatchedInplacePredict(x, p_m, model, missing, out_preds, + tree_begin, tree_end); } else if (x.type() == typeid(std::shared_ptr)) { - this->DispatchedInplacePredict(x, p_m, model, missing, out_preds, - tree_begin, tree_end); + this->DispatchedInplacePredict(x, p_m, model, missing, out_preds, + tree_begin, tree_end); } else if (x.type() == typeid(std::shared_ptr)) { - this->DispatchedInplacePredict( - x, p_m, model, missing, out_preds, tree_begin, tree_end); + this->DispatchedInplacePredict(x, p_m, model, missing, out_preds, + tree_begin, tree_end); } else { return false; } @@ -834,27 +899,29 @@ class CPUPredictor : public Predictor { if (p_fmat->Info().IsColumnSplit()) { CHECK(!model.learner_model_param->IsVectorLeaf()) << "Predict leaf with column split" << MTNotImplemented(); - + CHECK(!model.Cats()->HasCategorical()) + << "Categorical feature is not yet supported with column-split."; ColumnSplitHelper helper(n_threads, model, 0, ntree_limit); helper.PredictLeaf(ctx_, p_fmat, &preds); return; } std::vector feat_vecs; - const int num_feature = model.learner_model_param->num_feature; + const int n_features = model.learner_model_param->num_feature; InitThreadTemp(n_threads, &feat_vecs); - // start collecting the prediction - for (const auto &batch : p_fmat->GetBatches()) { - // parallel over local batch - auto page = batch.GetView(); + + auto launch = [&](SparsePage const &page, 
auto &&acc) { + using Enc = std::remove_reference_t; // The encoder. common::ParallelFor(page.Size(), n_threads, [&](auto i) { - const int tid = omp_get_thread_num(); - auto ridx = static_cast(batch.base_rowid + i); + auto tid = omp_get_thread_num(); + auto ridx = static_cast(page.base_rowid + i); RegTree::FVec &feats = feat_vecs[tid]; if (feats.Size() == 0) { - feats.Init(num_feature); + feats.Init(n_features); } - feats.Fill(page[i]); + SparsePageView view{&page, std::forward(acc)}; + view.Fill(i, &feats); + for (bst_tree_t j = 0; j < ntree_limit; ++j) { auto const &tree = *model.trees[j]; auto const &cats = tree.GetCategoriesMatrix(); @@ -868,6 +935,17 @@ class CPUPredictor : public Predictor { } feats.Drop(); }); + }; + + // Start collecting the prediction + for (const auto &batch : p_fmat->GetBatches()) { + // parallel over local batch + if (model.Cats()->HasCategorical() && !p_fmat->Cats()->Empty()) { + auto [acc, mapping] = MakeCatAccessor(ctx_, p_fmat->Cats()->HostView(), model); + launch(batch, std::move(acc)); + } else { + launch(batch, NoOpAccessor{}); + } } } @@ -897,20 +975,30 @@ class CPUPredictor : public Predictor { common::ParallelFor(ntree_limit, n_threads, [&](bst_omp_uint i) { FillNodeMeanValues(model.trees[i].get(), &(mean_values[i])); }); - // start collecting the contributions - if (!p_fmat->PageExists()) { - auto ft = p_fmat->Info().feature_types.ConstHostVector(); - for (const auto &batch : p_fmat->GetBatches(ctx_, {})) { - PredictContributionKernel(GHistIndexMatrixView{batch, ft}, info, model, tree_weights, - &mean_values, &feat_vecs, &contribs, ntree_limit, approximate, - condition, condition_feature); + + auto launch = [&](auto &&acc) { + // Start collecting the contributions + using Enc = std::remove_reference_t; // The encoder. 
+ if (!p_fmat->PageExists()) { + auto ft = p_fmat->Info().feature_types.ConstHostVector(); + for (const auto &batch : p_fmat->GetBatches(ctx_, {})) { + PredictContributionKernel(GHistIndexMatrixView{batch, std::forward(acc), ft}, info, + model, tree_weights, &mean_values, &feat_vecs, &contribs, + ntree_limit, approximate, condition, condition_feature); + } + } else { + for (const auto &batch : p_fmat->GetBatches()) { + PredictContributionKernel(SparsePageView{&batch, std::forward(acc)}, info, model, + tree_weights, &mean_values, &feat_vecs, &contribs, ntree_limit, + approximate, condition, condition_feature); + } } + }; + if (model.Cats()->HasCategorical() && !p_fmat->CatsShared()->Empty()) { + auto [acc, mapping] = MakeCatAccessor(ctx_, p_fmat->Cats()->HostView(), model); + launch(acc); } else { - for (const auto &batch : p_fmat->GetBatches()) { - PredictContributionKernel( - SparsePageView{&batch}, info, model, tree_weights, &mean_values, &feat_vecs, - &contribs, ntree_limit, approximate, condition, condition_feature); - } + launch(NoOpAccessor{}); } } @@ -923,8 +1011,8 @@ class CPUPredictor : public Predictor { CHECK(!p_fmat->Info().IsColumnSplit()) << "Predict interaction contribution support for " "column-wise data split is not yet implemented."; const MetaInfo& info = p_fmat->Info(); - const int ngroup = model.learner_model_param->num_output_group; - size_t const ncolumns = model.learner_model_param->num_feature; + auto const ngroup = model.learner_model_param->num_output_group; + auto const ncolumns = model.learner_model_param->num_feature; const unsigned row_chunk = ngroup * (ncolumns + 1) * (ncolumns + 1); const unsigned mrow_chunk = (ncolumns + 1) * (ncolumns + 1); const unsigned crow_chunk = ngroup * (ncolumns + 1); @@ -951,7 +1039,7 @@ class CPUPredictor : public Predictor { tree_weights, approximate, 1, i); for (size_t j = 0; j < info.num_row_; ++j) { - for (int l = 0; l < ngroup; ++l) { + for (std::remove_const_t l = 0; l < ngroup; ++l) { const unsigned o_offset = j * row_chunk + l * mrow_chunk + i * (ncolumns + 1); const unsigned c_offset = j * crow_chunk + l * (ncolumns + 1); contribs[o_offset + i] = 0; @@ -960,7 +1048,8 @@ class CPUPredictor : public Predictor { if (k == i) { contribs[o_offset + i] += contribs_diag[c_offset + k]; } else { - contribs[o_offset + k] = (contribs_on[c_offset + k] - contribs_off[c_offset + k])/2.0; + contribs[o_offset + k] = + (contribs_on[c_offset + k] - contribs_off[c_offset + k]) / 2.0; contribs[o_offset + i] -= contribs[o_offset + k]; } } @@ -970,7 +1059,7 @@ class CPUPredictor : public Predictor { } private: - static size_t constexpr kBlockOfRowsSize = 64; + static std::size_t constexpr kBlockOfRowsSize = 64; }; XGBOOST_REGISTER_PREDICTOR(CPUPredictor, "cpu_predictor") diff --git a/src/predictor/predict_fn.h b/src/predictor/predict_fn.h index e3be91d5fa3f..1b00add3e827 100644 --- a/src/predictor/predict_fn.h +++ b/src/predictor/predict_fn.h @@ -8,6 +8,7 @@ #include // for vector #include "../common/categorical.h" // for IsCat, Decision +#include "../data/adapter.h" // for COOTuple #include "xgboost/tree_model.h" // for RegTree namespace xgboost::predictor { @@ -64,5 +65,46 @@ inline bst_tree_t GetTreeLimit(std::vector> const &tree } return ntree_limit; } + +/** + * @brief Accessor for obtaining re-coded categories. 
+ */ +struct CatAccessor { + enc::MappingView enc; + + template + [[nodiscard]] XGBOOST_DEVICE T operator()(T fvalue, Fidx f_idx) const { + if (!enc.Empty() && !enc[f_idx].empty()) { + auto f_mapping = enc[f_idx]; + auto cat_idx = common::AsCat(fvalue); + if (cat_idx >= 0 && cat_idx < common::AsCat(f_mapping.size())) { + fvalue = f_mapping.data()[cat_idx]; + } + } + return fvalue; + } + + [[nodiscard]] XGBOOST_DEVICE float operator()(data::COOTuple const &e) const { + return this->operator()(e.value, e.column_idx); + } + + [[nodiscard]] XGBOOST_DEVICE float operator()(Entry const &e) const { + return this->operator()(e.fvalue, e.index); + } +}; + +/** + * @brief No-op accessor used to handle numeric data. + */ +struct NoOpAccessor { + XGBOOST_DEVICE explicit NoOpAccessor(enc::MappingView const &) {} + NoOpAccessor() = default; + template + [[nodiscard]] XGBOOST_DEVICE T operator()(T fvalue, Fidx) const { + return fvalue; + } + [[nodiscard]] XGBOOST_DEVICE float operator()(data::COOTuple const &e) const { return e.value; } + [[nodiscard]] XGBOOST_DEVICE float operator()(Entry const &e) const { return e.fvalue; } +}; } // namespace xgboost::predictor #endif // XGBOOST_PREDICTOR_PREDICT_FN_H_ diff --git a/tests/cpp/data/test_cat_container.cu b/tests/cpp/data/test_cat_container.cu index 860d386464d7..965135abbe16 100644 --- a/tests/cpp/data/test_cat_container.cu +++ b/tests/cpp/data/test_cat_container.cu @@ -3,10 +3,15 @@ */ #include +#include // for bst_cat_t +#include // for Span -#include "../../../src/common/common.h" +#include // for vector + +#include "../../../src/common/common.h" // for safe_cuda +#include "../../../src/common/threading_utils.h" // for ParallelFor #include "../encoder/df_mock.h" -#include "../helpers.h" +#include "../helpers.h" // for MakeCUDACtx #include "test_cat_container.h" namespace xgboost { @@ -30,4 +35,25 @@ TEST(CatContainer, MixedGpu) { auto ctx = MakeCUDACtx(0); auto df = TestCatContainerMixed(&ctx, eq_check); } + +TEST(CatContainer, ThreadSafety) { + auto ctx = MakeCUDACtx(0); + auto df = DfTest::Make(DfTest::MakeStrs("abc", "bcd", "cde", "ab"), DfTest::MakeInts(2, 2, 3, 0)); + auto h_df = df.View(); + auto cats = test_cat_detail::FromDf(&ctx, h_df); + cats.Sort(&ctx); // not thread safe + + common::ParallelFor(ctx.Threads(), 64, [&](auto i) { + auto sorted_idx = cats.RefSortedIndex(&ctx); + if (i % 2 == 0) { + auto h_cats = cats.HostView(); + ASSERT_EQ(h_cats.n_total_cats, 8); + } else { + auto d_cats = cats.DeviceView(&ctx); + ASSERT_EQ(d_cats.n_total_cats, 8); + } + auto sol = std::vector{3, 0, 1, 2, 3, 0, 1, 2}; + eq_check(sorted_idx, sol); + }); +} } // namespace xgboost diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py index 340188b23652..0d1a48201ab9 100644 --- a/tests/python-gpu/test_gpu_updaters.py +++ b/tests/python-gpu/test_gpu_updaters.py @@ -34,7 +34,8 @@ class TestGPUUpdatersMulti: ) @settings(deadline=None, max_examples=50, print_blob=True) def test_hist(self, param, num_rounds, dataset): - param["tree_method"] = "gpu_hist" + param["tree_method"] = "hist" + param["device"] = "cuda" param = dataset.set_params(param) result = train_result(param, dataset.get_dmat(), num_rounds) note(str(result)) @@ -208,7 +209,8 @@ def test_categorical_ames_housing( dataset = tm.TestDataset( "ames_housing", tm.data.get_ames_housing, "reg:squarederror", "rmse" ) - cat_parameters["tree_method"] = "gpu_hist" + cat_parameters["tree_method"] = "hist" + cat_parameters["device"] = "cuda" results = train_result(cat_parameters, 
dataset.get_dmat(), 16) tm.non_increasing(results["train"]["rmse"]) @@ -260,7 +262,8 @@ def test_gpu_hist_device_dmatrix( ) -> None: # We cannot handle empty dataset yet assume(len(dataset.y) > 0) - param["tree_method"] = "gpu_hist" + param["tree_method"] = "hist" + param["device"] = "cuda" param = dataset.set_params(param) result = train_result( param, @@ -281,7 +284,8 @@ def test_external_memory(self, param, num_rounds, dataset): return # We cannot handle empty dataset yet assume(len(dataset.y) > 0) - param["tree_method"] = "gpu_hist" + param["tree_method"] = "hist" + param["device"] = "cuda" param = dataset.set_params(param) m = dataset.get_external_dmat() external_result = train_result(param, m, num_rounds) @@ -317,8 +321,10 @@ def test_empty_dmatrix_prediction(self): @pytest.mark.mgpu @given(tm.make_dataset_strategy(), strategies.integers(0, 10)) @settings(deadline=None, max_examples=10, print_blob=True) - def test_specified_gpu_id_gpu_update(self, dataset, gpu_id): - param = {"tree_method": "gpu_hist", "gpu_id": gpu_id} + def test_specified_gpu_id_gpu_update( + self, dataset: tm.TestDataset, gpu_id: int + ) -> None: + param = {"tree_method": "hist", "device": f"cuda:{gpu_id}"} param = dataset.set_params(param) result = train_result(param, dataset.get_dmat(), 10) assert tm.non_increasing(result["train"][dataset.metric]) diff --git a/tests/python-gpu/test_gpu_with_sklearn.py b/tests/python-gpu/test_gpu_with_sklearn.py index a01e79ccc88a..0e5ac2f6d0a7 100644 --- a/tests/python-gpu/test_gpu_with_sklearn.py +++ b/tests/python-gpu/test_gpu_with_sklearn.py @@ -80,13 +80,18 @@ def test_categorical(): from sklearn.datasets import load_svmlight_file data_dir = tm.data_dir(__file__) - X, y = load_svmlight_file(os.path.join(data_dir, "agaricus.txt.train")) + X, y = load_svmlight_file( + os.path.join(data_dir, "agaricus.txt.train"), dtype=np.float32 + ) clf = xgb.XGBClassifier( - tree_method="gpu_hist", + tree_method="hist", + device="cuda", enable_categorical=True, n_estimators=10, ) X = pd.DataFrame(X.todense()).astype("category") + for c in X.columns: + X[c] = X[c].cat.rename_categories(int) clf.fit(X, y) with tempfile.TemporaryDirectory() as tempdir: @@ -105,7 +110,7 @@ def test_categorical(): def check_predt(X, y): reg = xgb.XGBRegressor( - tree_method="gpu_hist", enable_categorical=True, n_estimators=64 + tree_method="hist", enable_categorical=True, n_estimators=64, device="cuda" ) reg.fit(X, y) predts = reg.predict(X) diff --git a/tests/python/test_demos.py b/tests/python/test_demos.py index d20e5bc384cc..9f7bd7123fde 100644 --- a/tests/python/test_demos.py +++ b/tests/python/test_demos.py @@ -228,6 +228,8 @@ def test_cli_regression_demo() -> None: subprocess.check_call(cmd, cwd=reg_dir) exe = os.path.join(DEMO_DIR, os.path.pardir, "xgboost") + if not os.path.exists(exe): + pytest.skip("CLI executable not found.") conf = os.path.join(reg_dir, "machine.conf") subprocess.check_call([exe, conf], cwd=reg_dir) @@ -237,6 +239,9 @@ def test_cli_regression_demo() -> None: ) def test_cli_binary_classification() -> None: cls_dir = os.path.join(CLI_DEMO_DIR, "binary_classification") + exe = os.path.join(DEMO_DIR, os.path.pardir, "xgboost") + if not os.path.exists(exe): + pytest.skip("CLI executable not found.") with tm.DirectoryExcursion(cls_dir, cleanup=True): subprocess.check_call(["./runexp.sh"]) os.remove("0002.model") diff --git a/tests/python/test_ordinal.py b/tests/python/test_ordinal.py index 6863733f2d47..05cd641693a5 100644 --- a/tests/python/test_ordinal.py +++ 
b/tests/python/test_ordinal.py @@ -5,6 +5,11 @@ run_cat_container, run_cat_container_iter, run_cat_container_mixed, + run_cat_invalid, + run_cat_leaf, + run_cat_predict, + run_cat_shap, + run_cat_thread_safety, ) pytestmark = pytest.mark.skipif(**tm.no_multiple(tm.no_arrow(), tm.no_pandas())) @@ -20,3 +25,23 @@ def test_cat_container_mixed() -> None: def test_cat_container_iter() -> None: run_cat_container_iter("cpu") + + +def test_cat_predict() -> None: + run_cat_predict("cpu") + + +def test_cat_invalid() -> None: + run_cat_invalid("cpu") + + +def test_cat_thread_safety() -> None: + run_cat_thread_safety("cpu") + + +def test_cat_shap() -> None: + run_cat_shap("cpu") + + +def test_cat_leaf() -> None: + run_cat_leaf("cpu") diff --git a/tests/python/test_predict.py b/tests/python/test_predict.py index 4a81e807bfa3..fc330962cde9 100644 --- a/tests/python/test_predict.py +++ b/tests/python/test_predict.py @@ -1,6 +1,7 @@ """Tests for running inplace prediction.""" from concurrent.futures import ThreadPoolExecutor +from typing import List, Union import numpy as np import pandas as pd @@ -251,11 +252,14 @@ def test_dtypes(self) -> None: @pytest.mark.skipif(**tm.no_pandas()) def test_pd_dtypes(self) -> None: + import pandas as pd from pandas.api.types import is_bool_dtype for orig, x in pd_dtypes(): - dtypes = orig.dtypes if isinstance(orig, pd.DataFrame) else [orig.dtypes] - if isinstance(orig, pd.DataFrame) and is_bool_dtype(dtypes[0]): + dtypes: Union[List, pd.Series] = ( + orig.dtypes if isinstance(orig, pd.DataFrame) else [orig.dtypes] + ) + if isinstance(orig, pd.DataFrame) and is_bool_dtype(dtypes.iloc[0]): continue y = np.arange(x.shape[0]) Xy = xgb.DMatrix(orig, y, enable_categorical=True) diff --git a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py index 90a84a0090c1..0eccf1f46c67 100644 --- a/tests/test_distributed/test_with_dask/test_with_dask.py +++ b/tests/test_distributed/test_with_dask/test_with_dask.py @@ -1,4 +1,4 @@ -"""Copyright 2019-2024, XGBoost contributors""" +"""Copyright 2019-2025, XGBoost contributors""" import asyncio import json From 18c91fa62078d5b8d48a3f23748412c7e45a4df8 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 15 Mar 2025 13:56:21 +0800 Subject: [PATCH 006/224] Update release script for sdist. 
(#11337) (#11340) --- .../script/release_artifacts.py | 52 +++++++++---------- 1 file changed, 24 insertions(+), 28 deletions(-) rename dev/release-artifacts.py => ops/script/release_artifacts.py (92%) diff --git a/dev/release-artifacts.py b/ops/script/release_artifacts.py similarity index 92% rename from dev/release-artifacts.py rename to ops/script/release_artifacts.py index fc6c0f3b1307..52963fac8ded 100644 --- a/dev/release-artifacts.py +++ b/ops/script/release_artifacts.py @@ -5,39 +5,30 @@ """ import argparse -import os import shutil import subprocess import tarfile import tempfile from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple from urllib.request import urlretrieve import tqdm from packaging import version +from pypi_variants import make_pyproject from sh.contrib import git +from test_utils import PY_PACKAGE +from test_utils import ROOT as root_path +from test_utils import DirectoryExcursion # S3 bucket hosting the release artifacts S3_BUCKET_URL = "https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds" -ROOT = Path(__file__).absolute().parent.parent -DIST = ROOT / "python-package" / "dist" +DIST = Path(PY_PACKAGE) / "dist" +ROOT = Path(root_path) pbar = None -class DirectoryExcursion: - def __init__(self, path: Path) -> None: - self.path = path - self.curdir = Path.cwd().resolve() - - def __enter__(self) -> None: - os.chdir(self.path) - - def __exit__(self, *args: Any) -> None: - os.chdir(self.curdir) - - def show_progress(block_num: int, block_size: int, total_size: int) -> None: """Show file download progress.""" global pbar @@ -118,16 +109,24 @@ def make_python_sdist( dist_dir = outdir / "dist" dist_dir.mkdir(exist_ok=True) - # Apply patch to remove NCCL dependency - # Save the original content of pyproject.toml so that we can restore it later + # Build sdist for `xgboost-cpu`. with DirectoryExcursion(ROOT): - with open("python-package/pyproject.toml", "r") as f: - orig_pyproj_lines = f.read() - with open("ops/patch/remove_nccl_dep.patch", "r") as f: - patch_lines = f.read() - subprocess.run( - ["patch", "-p0"], input=patch_lines, check=True, text=True, encoding="utf-8" + make_pyproject("cpu") + with DirectoryExcursion(ROOT / "python-package"): + subprocess.run(["python", "-m", "build", "--sdist"], check=True) + sdist_name = ( + f"xgboost_cpu-{release}{rc}{rc_ver}.tar.gz" + if rc + else f"xgboost_cpu-{release}.tar.gz" ) + src = DIST / sdist_name + subprocess.run(["twine", "check", str(src)], check=True) + dest = dist_dir / sdist_name + shutil.move(src, dest) + + # Build sdist for `xgboost`. 
+ with DirectoryExcursion(ROOT): + make_pyproject("default") with DirectoryExcursion(ROOT / "python-package"): subprocess.run(["python", "-m", "build", "--sdist"], check=True) @@ -141,10 +140,6 @@ def make_python_sdist( dest = dist_dir / sdist_name shutil.move(src, dest) - with DirectoryExcursion(ROOT): - with open("python-package/pyproject.toml", "w") as f: - f.write(orig_pyproj_lines) - def download_python_wheels(branch: str, commit_hash: str, outdir: Path) -> None: """Download all Python binary wheels for the specified branch.""" @@ -318,6 +313,7 @@ def main(args: argparse.Namespace) -> None: rc_ver: Optional[int] = None else: # RC release + assert release_parsed.pre is not None rc, rc_ver = release_parsed.pre if rc != "rc": raise ValueError( From 429f81279ce2ed6d79c23f884fbf6cb9c926768f Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 15 Mar 2025 18:55:37 +0800 Subject: [PATCH 007/224] 3.0 release note. (#11285) --- doc/changes/index.rst | 1 + doc/changes/v3.0.0.rst | 368 ++++++++++++++++++++++++++++++++++++++++ doc/conf.py | 7 +- doc/requirements.txt | 1 + ops/script/changelog.py | 32 ++++ 5 files changed, 408 insertions(+), 1 deletion(-) create mode 100644 doc/changes/v3.0.0.rst create mode 100644 ops/script/changelog.py diff --git a/doc/changes/index.rst b/doc/changes/index.rst index 09bc215075e4..c1e155ca0421 100644 --- a/doc/changes/index.rst +++ b/doc/changes/index.rst @@ -8,4 +8,5 @@ For release notes prior to the 2.1 release, please see `news ` for more info. +- Optimization for nearly-dense input, see the section for :ref:`optimization + <3_0_optimization>` for more info. + +See our latest document for details :doc:`/tutorials/external_memory`. The PyPI package +(``pip install``) doesn't have ``RMM`` support, which is required by the GPU external +memory implementation. To experiment, you can compile XGBoost from source or wait for the +RAPIDS conda package to be available. + +.. _3_0_networking: + +********** +Networking +********** + +Continuing the work from the previous release, we updated the network module to improve +reliability. (:pr:`10453`, :pr:`10756`, :pr:`11111`, :pr:`10914`, :pr:`10828`, :pr:`10735`, :pr:`10693`, :pr:`10676`, :pr:`10349`, +:pr:`10397`, :pr:`10566`, :pr:`10526`, :pr:`10349`) + +The timeout option is now supported for NCCL using the NCCL asynchronous mode (:pr:`10850`, +:pr:`10934`, :pr:`10945`, :pr:`10930`). + +In addition, a new :py:class:`~xgboost.collective.Config` class is added for users to +specify various options including timeout, tracker port, etc for distributed +training. Both the Dask interface and the PySpark interface support the new +configuration. (:pr:`11003`, :pr:`10281`, :pr:`10983`, :pr:`10973`) + +**** +SYCL +**** + +Continuing the work on the SYCL integration, there are significant improvements in the +feature coverage for this release from more training parameters and more objectives to +distributed training, along with various optimization (:pr:`10884`, :pr:`10883`). + +Starting with 3.0, the SYCL-plugin is close to feature-complete, users can start working +on SYCL devices for in-core training and inference. Newly introduced features include: + +- Dask support for distributed training (:pr:`10812`) + +- Various training procedures, including split evaluation (:pr:`10605`, :pr:`10636`), grow policy + (:pr:`10690`, :pr:`10681`), cached prediction (:pr:`10701`). + +- Updates for objective functions. 
(:pr:`11029`, :pr:`10931`, :pr:`11016`, :pr:`10993`, :pr:`11064`, :pr:`10325`) + +- On-going work for float32-only devices. (:pr:`10702`) + +Other related PRs (:pr:`10842`, :pr:`10543`, :pr:`10806`, :pr:`10943`, :pr:`10987`, :pr:`10548`, :pr:`10922`, :pr:`10898`, :pr:`10576`) + +.. _3_0_features: + +******** +Features +******** + +This section describes new features in the XGBoost core. For language-specific features, +please visit corresponding sections. + +- A new initialization method for objectives that are derived from GLM. The new method is + based on the mean value of the input labels. The new method changes the result of the + estimated ``base_score``. (:pr:`10298`, :pr:`11331`) + +- The :py:class:`xgboost.QuantileDMatrix` can be used with all prediction types for both + CPU and GPU. + +- In prior releases, XGBoost makes a copy for the booster to release memory held by + internal tree methods. We formalize the procedure into a new booster method + :py:meth:`~xgboost.Booster.reset` / :cpp:func:`XGBoosterReset`. (:pr:`11042`) + +- OpenMP thread setting is exposed to the XGBoost global configuration. Users can use it + to workaround hardcoded OpenMP environment variables. (:pr:`11175`) + +- We improved learning to rank tasks for better hyper-parameter configuration and for + distributed training. + + + In 3.0, all three distributed interfaces, including Dask, Spark, and PySpark, support + sorting the data based on query ID. The option for the + :py:class:`~xgboost.dask.DaskXGBRanker` is true by default and can be opted + out. (:pr:`11146`, :pr:`11007`, :pr:`11047`, :pr:`11012`, :pr:`10823`, :pr:`11023`) + + + Also for learning to rank, a new parameter ``lambdarank_score_normalization`` is + introduced to make one of the normalizations optional. (:pr:`11272`) + + + The ``lambdarank_normalization`` now uses the number of pairs when normalizing the + ``mean`` pair strategy. Previously, the gradient was used for both ``topk`` and + ``mean``. :pr:`11322` + +- We have improved GPU quantile sketching to reduce memory usage. The improvement helps + the construction of the :py:class:`~xgboost.QuantileDMatrix` and the new + :py:class:`~xgboost.ExtMemQuantileDMatrix`. + + + A new multi-level sketching algorithm is employed to reduce the overall memory usage + with batched inputs. + + In addition to algorithmic changes, internal memory usage estimation and the quantile + container is also updated. (:pr:`10761`, :pr:`10843`) + + The change introduces two more parameters for the :py:class:`~xgboost.QuantileDMatrix` + and :py:class:`~xgboost.DataIter`, namely, ``max_quantile_batches`` and + ``min_cache_page_bytes``. + +- More work is needed to improve the support of categorical features. This release + supports plotting trees with stat for categorical nodes (:pr:`11053`). In addition, some + preparation work is ongoing for auto re-coding categories. (:pr:`11094`, :pr:`11114`, + :pr:`11089`) These are feature enhancements instead of blocking issues. +- Implement weight-based feature importance for vector-leaf. (:pr:`10700`) +- Reduced logging in the DMatrix construction. (:pr:`11080`) + +.. _3_0_optimization: + +************ +Optimization +************ + +In addition to the external memory and quantile sketching improvements, we have a number +of optimizations and performance fixes. + +- GPU tree methods now use significantly less memory for both dense inputs and near-dense + inputs. 
(:pr:`10821`, :pr:`10870`) +- For near-dense inputs, GPU training is much faster for both ``hist`` (about 2x) and + ``approx``. +- Quantile regression on CPU now can handle imbalance trees much more efficiently. (:pr:`11275`) +- Small optimization for DMatrix construction to reduce latency. Also, C users can now + reuse the :cpp:func:`ProxyDMatrix ` for multiple inference + calls. (:pr:`11273`) +- CPU prediction performance for :py:class:`~xgboost.QuantileDMatrix` has been improved + (:pr:`11139`) and now is on par with normal ``DMatrix``. +- Fixed a performance issue for running inference using CPU with extremely sparse + :py:class:`~xgboost.QuantileDMatrix` (:pr:`11250`). +- Optimize CPU training memory allocation for improved performance. (:pr:`11112`) +- Improved RMM (rapids memory manager) integration. Now, with the help of + :py:func:`~xgboost.config_context`, all memory allocated by XGBoost should be routed to + RMM. As a bonus, all ``thrust`` algorithms now use async policy. (:pr:`10873`, :pr:`11173`, :pr:`10712`, + :pr:`10712`, :pr:`10562`) +- When used without RMM, XGBoost is more careful with its use of caching allocator to + avoid holding too much device memory. (:pr:`10582`) + +**************** +Breaking Changes +**************** +This section lists breaking changes that affect all packages. + +- Remove the deprecated ``DeviceQuantileDMatrix``. (:pr:`10974`, :pr:`10491`) +- Support for saving the model in the ``deprecated`` has been removed. Users can still + load old models in 3.0. (:pr:`10490`) +- Support for the legacy (blocking) CUDA stream is removed (:pr:`10607`) + +********* +Bug Fixes +********* +- Fix the quantile error metric (pinball loss) with multiple quantiles. (:pr:`11279`) +- Fix potential access error when running prediction in multi-thread environment. (:pr:`11167`) +- Check the correct dump format for the ``gblinear``. (:pr:`10831`) + +************* +Documentation +************* +- A new tutorial for advanced usage with custom objective functions. (:pr:`10283`, :pr:`10725`) +- The new online document site now shows documents for all packages including Python, R, + and JVM-based packages. (:pr:`11240`, :pr:`11216`, :pr:`11166`) +- Lots of enhancements. (:pr:`10822`, 11137, :pr:`11138`, :pr:`11246`, :pr:`11266`, :pr:`11253`, :pr:`10731`, :pr:`11222`, + :pr:`10551`, :pr:`10533`) +- Consistent use of cmake in documents. (:pr:`10717`) +- Add a brief description for using the ``offset`` from the GLM setting (like + ``Poisson``). (:pr:`10996`) +- Cleanup document for building from source. (:pr:`11145`) +- Various fixes. (:pr:`10412`, :pr:`10405`, :pr:`10353`, :pr:`10464`, :pr:`10587`, :pr:`10350`, :pr:`11131`, :pr:`10815`) +- Maintenance. (:pr:`11052`, :pr:`10380`) + +************** +Python Package +************** + +- The ``feature_weights`` parameter in the sklearn interface is now defined as + a scikit-learn parameter. (:pr:`9506`) +- Initial support for polars, categorical feature is not yet supported. (:pr:`11126`, :pr:`11172`, + :pr:`11116`) +- Reduce pandas dataframe overhead and overhead for various imports. (:pr:`11058`, :pr:`11068`) +- Better xlabel in :py:func:`~xgboost.plot_importance` (:pr:`11009`) +- Validate reference dataset for training. The :py:func:`~xgboost.train` function now + throws an error if a :py:class:`~xgboost.QuantileDMatrix` is used as a validation + dataset without a reference. (:pr:`11105`) +- Fix misleading errors when feature names are missing during inference (:pr:`10814`) +- Add Stacklevel to Python warning callback. 
The change helps improve the error message + for the Python package. (:pr:`10977`) +- Remove circular reference in DataIter. It helps reduce memory usage. (:pr:`11177`) +- Add checks for invalid inputs for `cv`. (:pr:`11255`) +- Update Python project classifiers. (:pr:`10381`, :pr:`11028`) +- Support doc link for the sklearn module. Users can now find links to documents in a + jupyter notebook. (:pr:`10287`) + +- Dask + + + Prevent the training from hanging due to aborted workers. (:pr:`10985`) This helps + Dask XGBoost be robust against error. When a worker is killed, the training will fail + with an exception instead of hang. + + Optional support for client-side logging. (:pr:`10942`) + + Fix LTR with empty partition and NCCL error. (:pr:`11152`) + + Update to work with the latest Dask. (:pr:`11291`) + + See the :ref:`3_0_features` section for changes to ranking models. + + See the :ref:`3_0_networking` section for changes with the communication module. + +- PySpark + + + Expose Training and Validation Metrics. (:pr:`11133`) + + Add barrier before initializing the communicator. (:pr:`10938`) + + Extend support for columnar input to CPU (GPU-only previously). (:pr:`11299`) + + See the :ref:`3_0_features` section for changes to ranking models. + + See the :ref:`3_0_networking` section for changes with the communication module. + +- Document updates (:pr:`11265`). +- Maintenance. (:pr:`11071`, :pr:`11211`, :pr:`10837`, :pr:`10754`, :pr:`10347`, :pr:`10678`, :pr:`11002`, :pr:`10692`, :pr:`11006`, + :pr:`10972`, :pr:`10907`, :pr:`10659`, :pr:`10358`, :pr:`11149`, :pr:`11178`, :pr:`11248`) + +- Breaking changes + + + Remove deprecated `feval`. (:pr:`11051`) + + Remove dask from the default import. (:pr:`10935`) Users are now required to import the + XGBoost Dask through: + + .. code-block:: python + + from xgboost import dask as dxgb + + instead of: + + .. code-block:: python + + import xgboost as xgb + xgb.dask + + The change helps avoid introducing dask into the default import set. + + + Bump Python requirement to 3.10. (:pr:`10434`) + + Drop support for datatable. (:pr:`11070`) + +********* +R Package +********* + +We have been reworking the R package for a few releases now. In 3.0, we will start +publishing a new R package on public repositories, likely R-universe, before moving toward +a CRAN update. The new package features a much more ergonomic interface, which is also +more idiomatic to R speakers. In addition, a range of new features are introduced to the +package. To name a few, the new package includes categorical feature support, +``QuantileDMatrix``, and an initial implementation of the external memory training. + +Also, we finally have an online documentation site for the R package featuring both +vignettes and API references (:pr:`11166`, :pr:`11257`). A good starting point for the new interface +is the new ``xgboost()`` function. We won't list all the feature gains here, as there are +too many! Please visit the :doc:`/R-package/index` for more info. There's a migration +guide (:pr:`11197`) there if you use a previous XGBoost R package version. + +- Support for the MSVC build was dropped due to incompatibility with R headers. (:pr:`10355`, + :pr:`11150`) +- Maintenance (:pr:`11259`) +- Related PRs. 
(:pr:`11171`, :pr:`11231`, :pr:`11223`, :pr:`11073`, :pr:`11224`, :pr:`11076`, :pr:`11084`, :pr:`11081`, + :pr:`11072`, :pr:`11170`, :pr:`11123`, :pr:`11168`, :pr:`11264`, :pr:`11140`, :pr:`11117`, :pr:`11104`, :pr:`11095`, :pr:`11125`, :pr:`11124`, + :pr:`11122`, :pr:`11108`, :pr:`11102`, :pr:`11101`, :pr:`11100`, :pr:`11077`, :pr:`11099`, :pr:`11074`, :pr:`11065`, :pr:`11092`, :pr:`11090`, + :pr:`11096`, :pr:`11148`, :pr:`11151`, :pr:`11159`, :pr:`11204`, :pr:`11254`, :pr:`11109`, :pr:`11141`, :pr:`10798`, :pr:`10743`, :pr:`10849`, + :pr:`10747`, :pr:`11022`, :pr:`10989`, :pr:`11026`, :pr:`11060`, :pr:`11059`, :pr:`11041`, :pr:`11043`, :pr:`11025`, :pr:`10674`, :pr:`10727`, + :pr:`10745`, :pr:`10733`, :pr:`10750`, :pr:`10749`, :pr:`10744`, :pr:`10794`, :pr:`10330`, :pr:`10698`, :pr:`10687`, :pr:`10688`, :pr:`10654`, + :pr:`10456`, :pr:`10556`, :pr:`10465`, :pr:`10337`) + +************ +JVM Packages +************ + +The XGBoost 3.0 release features a significant update to the JVM packages, and in +particular, the Spark package. There are breaking changes in packaging and some +parameters. Please visit the :doc:`migration guide ` for +related changes. The work brings new features and a more unified feature set between CPU +and GPU implementation. (:pr:`10639`, :pr:`10833`, :pr:`10845`, :pr:`10847`, :pr:`10635`, :pr:`10630`, :pr:`11179`, :pr:`11184`) + +- Automatic partitioning for distributed learning to rank. See the :ref:`features + <3_0_features>` section above (:pr:`11023`). +- Resolve spark compatibility issue (:pr:`10917`) +- Support missing value when constructing dmatrix with iterator (:pr:`10628`) +- Fix transform performance issue (:pr:`10925`) +- Honor skip.native.build option in xgboost4j-gpu (:pr:`10496`) +- Support array features type for CPU (:pr:`10937`) +- Change default missing value to ``NaN`` for better alignment (:pr:`11225`) +- Don't cast to float if it's already float (:pr:`10386`) +- Maintenance. (:pr:`10982`, :pr:`10979`, :pr:`10978`, :pr:`10673`, :pr:`10660`, :pr:`10835`, :pr:`10836`, :pr:`10857`, :pr:`10618`, + :pr:`10627`) + +*********** +Maintenance +*********** + +Code maintenance includes both refactoring (:pr:`10531`, :pr:`10573`, :pr:`11069`), cleanups (:pr:`11129`, +:pr:`10878`, :pr:`11244`, :pr:`10401`, :pr:`10502`, :pr:`11107`, :pr:`11097`, :pr:`11130`, :pr:`10758`, :pr:`10923`, :pr:`10541`, :pr:`10990`), +and improvements for tests (:pr:`10611`, :pr:`10658`, :pr:`10583`, :pr:`11245`, :pr:`10708`), along with fixing +various warnings in compilers and test dependencies (:pr:`10757`, :pr:`10641`, :pr:`11062`, +:pr:`11226`). Also, miscellaneous updates, including some dev scripts and profiling annotations +(:pr:`10485`, :pr:`10657`, :pr:`10854`, :pr:`10718`, :pr:`11158`, :pr:`10697`, :pr:`11276`). 
+ +Lastly, dependency updates (:pr:`10362`, :pr:`10363`, :pr:`10360`, :pr:`10373`, :pr:`10377`, :pr:`10368`, :pr:`10369`, +:pr:`10366`, :pr:`11032`, :pr:`11037`, :pr:`11036`, :pr:`11035`, :pr:`11034`, :pr:`10518`, :pr:`10536`, :pr:`10586`, :pr:`10585`, :pr:`10458`, +:pr:`10547`, :pr:`10429`, :pr:`10517`, :pr:`10497`, :pr:`10588`, :pr:`10975`, :pr:`10971`, :pr:`10970`, :pr:`10949`, :pr:`10947`, :pr:`10863`, +:pr:`10953`, :pr:`10954`, :pr:`10951`, :pr:`10590`, :pr:`10600`, :pr:`10599`, :pr:`10535`, :pr:`10516`, :pr:`10786`, :pr:`10859`, :pr:`10785`, +:pr:`10779`, :pr:`10790`, :pr:`10777`, :pr:`10855`, :pr:`10848`, :pr:`10778`, :pr:`10772`, :pr:`10771`, :pr:`10862`, :pr:`10952`, :pr:`10768`, +:pr:`10770`, :pr:`10769`, :pr:`10664`, :pr:`10663`, :pr:`10892`, :pr:`10979`, :pr:`10978`). + +*** +CI +*** + +- The CI is reworked to use `RunsOn` to integrate custom CI pipelines with GitHub + action. The migration helps us reduce the maintenance burden and make the CI + configuration more accessible to others. (:pr:`11001`, :pr:`11079`, :pr:`10649`, :pr:`11196`, :pr:`11055`, + :pr:`10483`, :pr:`11078`, :pr:`11157`) + +- Other maintenance work includes various small fixes, enhancements, and tooling + updates. (:pr:`10877`, :pr:`10494`, :pr:`10351`, :pr:`10609`, :pr:`11192`, :pr:`11188`, :pr:`11142`, :pr:`10730`, :pr:`11066`, + :pr:`11063`, :pr:`10800`, :pr:`10995`, :pr:`10858`, :pr:`10685`, :pr:`10593`, :pr:`11061`) diff --git a/doc/conf.py b/doc/conf.py index ce6a0219ccb1..6c5c456ac9f8 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -225,7 +225,7 @@ def is_readthedocs_build(): # General information about the project. project = "xgboost" author = "%s developers" % project -copyright = "2022, %s" % author +copyright = "2025, %s" % author github_doc_root = "https://github.com/dmlc/xgboost/tree/master/doc/" # Add any Sphinx extension module names here, as strings. They can be @@ -238,6 +238,7 @@ def is_readthedocs_build(): "sphinx.ext.mathjax", "sphinx.ext.intersphinx", "sphinx_gallery.gen_gallery", + "sphinx_issues", "breathe", "myst_parser", ] @@ -262,6 +263,10 @@ def is_readthedocs_build(): "matplotlib_animations": True, } +# Sphinx-issues configuration +# Path to GitHub repo {group}/{project} (note that `group` is the GitHub user or organization) +issues_github_path = "dmlc/xgboost" + autodoc_typehints = "description" graphviz_output_format = "png" diff --git a/doc/requirements.txt b/doc/requirements.txt index 9a2097035228..d73e5bdf2b84 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -11,6 +11,7 @@ scipy myst-parser ray[train] sphinx-gallery +sphinx-issues dask pyspark cloudpickle diff --git a/ops/script/changelog.py b/ops/script/changelog.py new file mode 100644 index 000000000000..552a82f2e49d --- /dev/null +++ b/ops/script/changelog.py @@ -0,0 +1,32 @@ +"""Helper script for creating links to PRs for changelog. This should be used with the +`sphinx-issues` extension. + +""" + +import argparse +import os +import re + +from test_utils import ROOT + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--version", + type=str, + required=True, + help="Major version of the changelog, e.g., 3.0.0 .", + ) + args = parser.parse_args() + version = args.version + + fname = os.path.join(ROOT, f"doc/changes/v{version}.rst") + + with open(fname) as fd: + note = fd.read() + + # E.g. #11285 -> :pr:`11285`. 
+ regex = re.compile(r"(#)(\d+)") + note = re.sub(regex, r":pr:`\2`", note) + with open(fname, "w") as fd: + fd.write(note) From 257b87ca949b3d622886e5cdbf23c66eaa9784d4 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 17 Mar 2025 13:54:21 +0800 Subject: [PATCH 008/224] [EM] Fix page concatenation for validation dataset. (#11338) --- src/data/ellpack_page_source.cu | 37 ++++++----- src/data/ellpack_page_source.h | 7 ++- .../cpp/data/test_ellpack_page_raw_format.cu | 61 +++++++++++++++++++ 3 files changed, 90 insertions(+), 15 deletions(-) diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu index 1b839e89df15..8dbf2d3ec696 100644 --- a/src/data/ellpack_page_source.cu +++ b/src/data/ellpack_page_source.cu @@ -104,23 +104,29 @@ class EllpackHostCacheStreamImpl { this->cache_->sizes_orig.push_back(page.Impl()->MemCostBytes()); auto orig_ptr = this->cache_->sizes_orig.size() - 1; + CHECK_EQ(this->cache_->pages.size(), this->cache_->on_device.size()); CHECK_LT(orig_ptr, this->cache_->NumBatchesOrig()); auto cache_idx = this->cache_->cache_mapping.at(orig_ptr); // Wrap up the previous page if this is a new page, or this is the last page. auto new_page = cache_idx == this->cache_->pages.size(); - + // Last page expected from the user. auto last_page = (orig_ptr + 1) == this->cache_->NumBatchesOrig(); - // No page concatenation is performed. If there's page concatenation, then the number - // of pages in the cache must be smaller than the input number of pages. - bool no_concat = this->cache_->NumBatchesOrig() == this->cache_->buffer_rows.size(); + + bool const no_concat = this->cache_->NoConcat(); + // Whether the page should be cached in device. If true, then we don't need to make a // copy during write since the temporary page is already in device when page // concatenation is enabled. - bool to_device = this->cache_->prefer_device && - this->cache_->NumDevicePages() < this->cache_->max_num_device_pages; - - auto commit_page = [&ctx](EllpackPageImpl const* old_impl) { + // + // This applies only to a new cached page. If we are concatenating this page to an + // existing cached page, then we should respect the existing flag obtained from the + // first page of the cached page. + bool to_device_if_new_page = + this->cache_->prefer_device && + this->cache_->NumDevicePages() < this->cache_->max_num_device_pages; + + auto commit_host_page = [](EllpackPageImpl const* old_impl) { CHECK_EQ(old_impl->gidx_buffer.Resource()->Type(), common::ResourceHandler::kCudaMalloc); auto new_impl = std::make_unique(); new_impl->CopyInfo(old_impl); @@ -137,7 +143,7 @@ class EllpackHostCacheStreamImpl { auto new_impl = std::make_unique(); new_impl->CopyInfo(page.Impl()); - if (to_device) { + if (to_device_if_new_page) { // Copy to device new_impl->gidx_buffer = common::MakeFixedVecWithCudaMalloc( page.Impl()->gidx_buffer.size()); @@ -151,15 +157,16 @@ class EllpackHostCacheStreamImpl { this->cache_->offsets.push_back(new_impl->n_rows * new_impl->info.row_stride); this->cache_->pages.push_back(std::move(new_impl)); + this->cache_->on_device.push_back(to_device_if_new_page); return new_page; } if (new_page) { // No need to copy if it's already in device. - if (!this->cache_->pages.empty() && !to_device) { + if (!this->cache_->pages.empty() && !this->cache_->on_device.back()) { // Need to wrap up the previous page. - auto commited = commit_page(this->cache_->pages.back().get()); - // Replace the previous page with a new page. 
+ auto commited = commit_host_page(this->cache_->pages.back().get()); + // Replace the previous page (on device) with a new page on host. this->cache_->pages.back() = std::move(commited); } // Push a new page @@ -174,7 +181,9 @@ class EllpackHostCacheStreamImpl { auto offset = new_impl->Copy(&ctx, impl, 0); this->cache_->offsets.push_back(offset); + this->cache_->pages.push_back(std::move(new_impl)); + this->cache_->on_device.push_back(to_device_if_new_page); } else { CHECK(!this->cache_->pages.empty()); CHECK_EQ(cache_idx, this->cache_->pages.size() - 1); @@ -182,8 +191,8 @@ class EllpackHostCacheStreamImpl { auto offset = new_impl->Copy(&ctx, impl, this->cache_->offsets.back()); this->cache_->offsets.back() += offset; // No need to copy if it's already in device. - if (last_page && !to_device) { - auto commited = commit_page(this->cache_->pages.back().get()); + if (last_page && !this->cache_->on_device.back()) { + auto commited = commit_host_page(this->cache_->pages.back().get()); this->cache_->pages.back() = std::move(commited); } } diff --git a/src/data/ellpack_page_source.h b/src/data/ellpack_page_source.h index cb921daa446f..a668c39bdef4 100644 --- a/src/data/ellpack_page_source.h +++ b/src/data/ellpack_page_source.h @@ -1,5 +1,5 @@ /** - * Copyright 2019-2024, XGBoost Contributors + * Copyright 2019-2025, XGBoost Contributors */ #ifndef XGBOOST_DATA_ELLPACK_PAGE_SOURCE_H_ @@ -47,6 +47,7 @@ struct EllpackCacheInfo { // This is a memory-based cache. It can be a mixed of the device memory and the host memory. struct EllpackMemCache { std::vector> pages; + std::vector on_device; std::vector offsets; // Size of each batch before concatenation. std::vector sizes_orig; @@ -65,6 +66,9 @@ struct EllpackMemCache { [[nodiscard]] std::size_t SizeBytes() const; [[nodiscard]] bool Empty() const { return this->SizeBytes() == 0; } + // No page concatenation is performed. If there's page concatenation, then the number of + // pages in the cache must be smaller than the input number of pages. 
+ [[nodiscard]] bool NoConcat() const { return this->NumBatchesOrig() == this->buffer_rows.size(); } [[nodiscard]] bst_idx_t NumBatchesOrig() const { return cache_mapping.size(); } [[nodiscard]] EllpackPageImpl const* At(std::int32_t k) const; @@ -187,6 +191,7 @@ class EllpackCacheStreamPolicy : public F { [[nodiscard]] std::unique_ptr CreateReader(StringView name, bst_idx_t offset, bst_idx_t length) const; + std::shared_ptr Share() const { return p_cache_; } }; template typename F> diff --git a/tests/cpp/data/test_ellpack_page_raw_format.cu b/tests/cpp/data/test_ellpack_page_raw_format.cu index 32f0bed1e016..2351089f6f4d 100644 --- a/tests/cpp/data/test_ellpack_page_raw_format.cu +++ b/tests/cpp/data/test_ellpack_page_raw_format.cu @@ -157,4 +157,65 @@ TEST_P(TestEllpackPageRawFormat, HostIO) { } INSTANTIATE_TEST_SUITE_P(EllpackPageRawFormat, TestEllpackPageRawFormat, ::testing::Bool()); + +TEST(EllpackPageRawFormat, DevicePageConcat) { + auto ctx = MakeCUDACtx(0); + auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()}; + bst_idx_t n_features = 16, n_samples = 128; + + auto test = [&](std::int32_t max_num_device_pages, std::int64_t min_cache_page_bytes) { + EllpackCacheInfo cinfo{param, true, max_num_device_pages, + std::numeric_limits::quiet_NaN()}; + ExternalDataInfo ext_info; + + ext_info.n_batches = 8; + ext_info.row_stride = n_features; + for (bst_idx_t i = 0; i < ext_info.n_batches; ++i) { + ext_info.base_rowids.push_back(n_samples); + } + std::partial_sum(ext_info.base_rowids.cbegin(), ext_info.base_rowids.cend(), + ext_info.base_rowids.begin()); + ext_info.accumulated_rows = n_samples * ext_info.n_batches; + ext_info.nnz = ext_info.accumulated_rows * n_features; + + auto p_fmat = RandomDataGenerator{n_samples, n_features, 0}.Seed(0).GenerateDMatrix(); + EllpackCacheStreamPolicy policy; + + for (auto const &page : p_fmat->GetBatches(&ctx, param)) { + auto cuts = page.Impl()->CutsShared(); + CalcCacheMapping(&ctx, true, cuts, min_cache_page_bytes, ext_info, &cinfo); + [&] { + ASSERT_EQ(cinfo.buffer_rows.size(), 4ul); + }(); + policy.SetCuts(page.Impl()->CutsShared(), ctx.Device(), std::move(cinfo)); + } + + auto format = policy.CreatePageFormat(param); + + // write multipe pages + for (bst_idx_t i = 0; i < ext_info.n_batches; ++i) { + for (auto const &page : p_fmat->GetBatches(&ctx, param)) { + auto writer = policy.CreateWriter({}, i); + [[maybe_unused]] auto n_bytes = format->Write(page, writer.get()); + } + } + // check correct concatenation. 
+ auto mem_cache = policy.Share(); + return mem_cache; + }; + + { + auto mem_cache = test(1, n_features * n_samples); + ASSERT_EQ(mem_cache->on_device.size(), 4); + ASSERT_TRUE(mem_cache->on_device[0]); + ASSERT_EQ(mem_cache->NumDevicePages(), 1); + } + { + auto mem_cache = test(2, n_features * n_samples); + ASSERT_EQ(mem_cache->on_device.size(), 4); + ASSERT_TRUE(mem_cache->on_device[0]); + ASSERT_TRUE(mem_cache->on_device[1]); + ASSERT_EQ(mem_cache->NumDevicePages(), 2); + } +} } // namespace xgboost::data From 7bd21832a82dd659357ce7179838c2d8dd2322bb Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Wed, 19 Mar 2025 07:36:04 -0700 Subject: [PATCH 009/224] [Doc] CUDA 12.0+ is now required (#11344) --- doc/changes/v3.0.0.rst | 1 + doc/gpu/index.rst | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/changes/v3.0.0.rst b/doc/changes/v3.0.0.rst index e236ba3132f7..bc1722ad4f2a 100644 --- a/doc/changes/v3.0.0.rst +++ b/doc/changes/v3.0.0.rst @@ -196,6 +196,7 @@ This section lists breaking changes that affect all packages. - Support for saving the model in the ``deprecated`` has been removed. Users can still load old models in 3.0. (:pr:`10490`) - Support for the legacy (blocking) CUDA stream is removed (:pr:`10607`) +- XGBoost now requires CUDA 12.0 or later. ********* Bug Fixes diff --git a/doc/gpu/index.rst b/doc/gpu/index.rst index 9603a628cb81..515939723e49 100644 --- a/doc/gpu/index.rst +++ b/doc/gpu/index.rst @@ -4,7 +4,7 @@ XGBoost GPU Support This page contains information about GPU algorithms supported in XGBoost. -.. note:: CUDA 11.0, Compute Capability 5.0 required (See `this list `_ to look up compute capability of your GPU card.) +.. note:: CUDA 12.0, Compute Capability 5.0 required (See `this list `_ to look up compute capability of your GPU card.) ********************************************* CUDA Accelerated Tree Construction Algorithms From 62bae0fcb835c44ac2e9e8b56ad62886635ffbf2 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 20 Mar 2025 01:32:10 +0800 Subject: [PATCH 010/224] Update loky to 3.5.1. 
(#11341) --- ops/conda_env/aarch64_test.yml | 2 +- ops/conda_env/linux_cpu_test.yml | 2 +- ops/conda_env/macos_cpu_test.yml | 11 ++--------- ops/conda_env/win64_test.yml | 2 +- tests/python/test_collective.py | 6 +++++- 5 files changed, 10 insertions(+), 13 deletions(-) diff --git a/ops/conda_env/aarch64_test.yml b/ops/conda_env/aarch64_test.yml index 14305ebbf090..d7dd13639ff3 100644 --- a/ops/conda_env/aarch64_test.yml +++ b/ops/conda_env/aarch64_test.yml @@ -26,7 +26,7 @@ dependencies: - awscli - numba - llvmlite -- loky +- loky>=3.5.1 - pyarrow - pyspark>=3.4.0 - cloudpickle diff --git a/ops/conda_env/linux_cpu_test.yml b/ops/conda_env/linux_cpu_test.yml index e4c0b507c8e2..55bac17f2dbb 100644 --- a/ops/conda_env/linux_cpu_test.yml +++ b/ops/conda_env/linux_cpu_test.yml @@ -34,7 +34,7 @@ dependencies: - boto3 - awscli - py-ubjson -- loky +- loky>=3.5.1 - pyarrow - protobuf - cloudpickle diff --git a/ops/conda_env/macos_cpu_test.yml b/ops/conda_env/macos_cpu_test.yml index 29ff99e3504f..390abf141803 100644 --- a/ops/conda_env/macos_cpu_test.yml +++ b/ops/conda_env/macos_cpu_test.yml @@ -6,8 +6,6 @@ dependencies: - pip - wheel - pyyaml -- cpplint -- pylint - numpy - scipy - llvm-openmp @@ -20,22 +18,17 @@ dependencies: - python-graphviz - hypothesis - astroid -- sphinx - sh -- recommonmark -- mock -- breathe - pytest - pytest-cov +- pytest-timeout - python-kubernetes - urllib3 - jsonschema - boto3 - awscli -- loky +- loky>=3.5.1 - pyarrow -- pyspark>=3.4.0 - cloudpickle - pip: - setuptools - - sphinx_rtd_theme diff --git a/ops/conda_env/win64_test.yml b/ops/conda_env/win64_test.yml index 32b9339e6fc0..6e87e1560c21 100644 --- a/ops/conda_env/win64_test.yml +++ b/ops/conda_env/win64_test.yml @@ -16,5 +16,5 @@ dependencies: - python-graphviz - pip - py-ubjson -- loky +- loky>=3.5.1 - pyarrow diff --git a/tests/python/test_collective.py b/tests/python/test_collective.py index 473b38b5b742..1204c0faf8c9 100644 --- a/tests/python/test_collective.py +++ b/tests/python/test_collective.py @@ -3,7 +3,6 @@ import numpy as np import pytest -from loky import get_reusable_executor import xgboost as xgb from xgboost import RabitTracker, build_info, federated @@ -25,10 +24,13 @@ def run_rabit_worker(rabit_env: dict, world_size: int) -> int: @pytest.mark.skipif(**tm.no_loky()) def test_rabit_communicator() -> None: + from loky import get_reusable_executor + world_size = 2 tracker = RabitTracker(host_ip="127.0.0.1", n_workers=world_size) tracker.start() workers = [] + with get_reusable_executor(max_workers=world_size) as pool: for _ in range(world_size): worker = pool.submit( @@ -60,6 +62,8 @@ def run_federated_worker(port: int, world_size: int, rank: int) -> int: @pytest.mark.skipif(**tm.skip_win()) @pytest.mark.skipif(**tm.no_loky()) def test_federated_communicator() -> None: + from loky import get_reusable_executor + if not build_info()["USE_FEDERATED"]: pytest.skip("XGBoost not built with federated learning enabled") From dec7f5896e50191f5b5fb2314c09c2f76304a47b Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 20 Mar 2025 01:58:05 +0800 Subject: [PATCH 011/224] [EM] Optimize single batch. 
(#11339) --- src/data/ellpack_page_raw_format.cu | 4 ++-- src/data/ellpack_page_raw_format.h | 4 ++-- src/data/ellpack_page_source.cu | 6 ++++++ src/data/ellpack_page_source.h | 4 ++++ src/data/extmem_quantile_dmatrix.cu | 8 ++++---- tests/cpp/data/test_ellpack_page_raw_format.cu | 4 ++++ 6 files changed, 22 insertions(+), 8 deletions(-) diff --git a/src/data/ellpack_page_raw_format.cu b/src/data/ellpack_page_raw_format.cu index 2907174a0920..955cea2d5c88 100644 --- a/src/data/ellpack_page_raw_format.cu +++ b/src/data/ellpack_page_raw_format.cu @@ -75,7 +75,7 @@ template return true; } -[[nodiscard]] std::size_t EllpackPageRawFormat::Write(const EllpackPage& page, +[[nodiscard]] std::size_t EllpackPageRawFormat::Write(EllpackPage const& page, common::AlignedFileWriteStream* fo) { xgboost_NVTX_FN_RANGE(); @@ -109,7 +109,7 @@ template return true; } -[[nodiscard]] std::size_t EllpackPageRawFormat::Write(const EllpackPage& page, +[[nodiscard]] std::size_t EllpackPageRawFormat::Write(EllpackPage const& page, EllpackHostCacheStream* fo) const { xgboost_NVTX_FN_RANGE(); diff --git a/src/data/ellpack_page_raw_format.h b/src/data/ellpack_page_raw_format.h index 9be2c50cff46..eda0e1d20978 100644 --- a/src/data/ellpack_page_raw_format.h +++ b/src/data/ellpack_page_raw_format.h @@ -38,11 +38,11 @@ class EllpackPageRawFormat : public SparsePageFormat { param_{std::move(param)}, has_hmm_ats_{has_hmm_ats} {} [[nodiscard]] bool Read(EllpackPage* page, common::AlignedResourceReadStream* fi) override; - [[nodiscard]] std::size_t Write(const EllpackPage& page, + [[nodiscard]] std::size_t Write(EllpackPage const& page, common::AlignedFileWriteStream* fo) override; [[nodiscard]] bool Read(EllpackPage* page, EllpackHostCacheStream* fi) const; - [[nodiscard]] std::size_t Write(const EllpackPage& page, EllpackHostCacheStream* fo) const; + [[nodiscard]] std::size_t Write(EllpackPage const& page, EllpackHostCacheStream* fo) const; }; #if !defined(XGBOOST_USE_CUDA) diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu index 8dbf2d3ec696..cd99de0d38b0 100644 --- a/src/data/ellpack_page_source.cu +++ b/src/data/ellpack_page_source.cu @@ -338,6 +338,12 @@ void CalcCacheMapping(Context const* ctx, bool is_dense, cinfo->cache_mapping = std::move(cache_mapping); cinfo->buffer_bytes = std::move(cache_bytes); cinfo->buffer_rows = std::move(cache_rows); + + // Directly store in device if there's only one batch. + if (cinfo->NumBatchesCc() == 1) { + cinfo->prefer_device = true; + LOG(INFO) << "Prefer device cache as there's only 1 page."; + } } /** diff --git a/src/data/ellpack_page_source.h b/src/data/ellpack_page_source.h index a668c39bdef4..d8d6e139c83a 100644 --- a/src/data/ellpack_page_source.h +++ b/src/data/ellpack_page_source.h @@ -38,6 +38,10 @@ struct EllpackCacheInfo { prefer_device{prefer_device}, max_num_device_pages{max_num_device_pages}, missing{missing} {} + + // Only effective for host-based cache. + // The number of batches for the concatenated cache. 
+ [[nodiscard]] std::size_t NumBatchesCc() const { return this->buffer_rows.size(); } }; // We need to decouple the storage and the view of the storage so that we can implement diff --git a/src/data/extmem_quantile_dmatrix.cu b/src/data/extmem_quantile_dmatrix.cu index 533d68b2b915..a633ac984e89 100644 --- a/src/data/extmem_quantile_dmatrix.cu +++ b/src/data/extmem_quantile_dmatrix.cu @@ -58,14 +58,14 @@ void ExtMemQuantileDMatrix::InitFromCUDA( /** * Calculate cache info */ - // Prefer device storage for validation dataset since we can't hide it's data load - // overhead with inference. But the training procedures can confortably overlap with the - // data transfer. + // Prefer device storage for validation dataset since we can't hide the data loading + // overhead with inference. On the other hand, training procedures can confortably + // overlap with the data transfer. auto cinfo = EllpackCacheInfo{p, (ref != nullptr), config.max_num_device_pages, config.missing}; CalcCacheMapping(ctx, this->info_.IsDense(), cuts, DftMinCachePageBytes(config.min_cache_page_bytes), ext_info, &cinfo); CHECK_EQ(cinfo.cache_mapping.size(), ext_info.n_batches); - auto n_batches = cinfo.buffer_rows.size(); // The number of batches after page concatenation. + auto n_batches = cinfo.NumBatchesCc(); LOG(INFO) << "Number of batches after concatenation:" << n_batches; /** diff --git a/tests/cpp/data/test_ellpack_page_raw_format.cu b/tests/cpp/data/test_ellpack_page_raw_format.cu index 2351089f6f4d..216736e05f55 100644 --- a/tests/cpp/data/test_ellpack_page_raw_format.cu +++ b/tests/cpp/data/test_ellpack_page_raw_format.cu @@ -24,6 +24,10 @@ namespace { CalcCacheMapping(ctx, Xy->IsDense(), cuts, 0, ext_info, &cinfo); CHECK_EQ(ext_info.n_batches, cinfo.cache_mapping.size()); + if (cinfo.NumBatchesCc() == 1) { + EXPECT_TRUE(cinfo.prefer_device); + cinfo.prefer_device = false; // We test the host cache. + } return cinfo; } From 66d83eed03cd7bd7610dabd82ceb1a25259ca715 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 20 Mar 2025 17:41:54 +0800 Subject: [PATCH 012/224] [CI] Remove nccl RAS workaround. (#11349) --- ops/docker_run.py | 1 - ops/pipeline/test-python-wheel-impl.sh | 1 - 2 files changed, 2 deletions(-) diff --git a/ops/docker_run.py b/ops/docker_run.py index ba6c8e8c98c0..949f7fb7807d 100644 --- a/ops/docker_run.py +++ b/ops/docker_run.py @@ -70,7 +70,6 @@ def docker_run( docker_run_cli_args.extend( itertools.chain.from_iterable([["-e", f"{k}={v}"] for k, v in user_ids.items()]) ) - docker_run_cli_args.extend(["-e", "NCCL_RAS_ENABLE=0"]) docker_run_cli_args.extend(extra_args) docker_run_cli_args.append(image_uri) docker_run_cli_args.extend(command_args) diff --git a/ops/pipeline/test-python-wheel-impl.sh b/ops/pipeline/test-python-wheel-impl.sh index 4620e6ebf7fc..5c24e31210d2 100755 --- a/ops/pipeline/test-python-wheel-impl.sh +++ b/ops/pipeline/test-python-wheel-impl.sh @@ -45,7 +45,6 @@ case "$suite" in mgpu) echo "-- Run Python tests, using multiple GPUs" python -c 'from cupy.cuda import jitify; jitify._init_module()' - export NCCL_RAS_ENABLE=0 pytest -v -s -rxXs --durations=0 -m 'mgpu' tests/python-gpu pytest -v -s -rxXs --durations=0 tests/test_distributed/test_gpu_with_dask pytest -v -s -rxXs --durations=0 tests/test_distributed/test_gpu_with_spark From 8536af5f5df1a82a4e9b924c68bc386dda591a5d Mon Sep 17 00:00:00 2001 From: jakirkham Date: Thu, 20 Mar 2025 08:27:57 -0700 Subject: [PATCH 013/224] Use RMM's pached CCCL (#11351) Make sure to search for RMM if it will be used. 
This should pick up the patched CCCL from RMM. If RMM is not being used and this is a CUDA build, search for CCCL explicitly. --- CMakeLists.txt | 52 +++++++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 845347ea1ad6..ee18a2afdf96 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -229,30 +229,6 @@ if(USE_CUDA) endif() find_package(CUDAToolkit REQUIRED) - find_package(CCCL CONFIG) - if(CCCL_FOUND) - message(STATUS "Standalone CCCL found.") - else() - message(STATUS "Standalone CCCL not found. Attempting to use CCCL from CUDA Toolkit...") - find_package(CCCL CONFIG - HINTS ${CUDAToolkit_LIBRARY_DIR}/cmake) - if(NOT CCCL_FOUND) - message(STATUS "Could not locate CCCL from CUDA Toolkit. Using Thrust and CUB from CUDA Toolkit...") - find_package(libcudacxx CONFIG REQUIRED - HINTS ${CUDAToolkit_LIBRARY_DIR}/cmake) - find_package(CUB CONFIG REQUIRED - HINTS ${CUDAToolkit_LIBRARY_DIR}/cmake) - find_package(Thrust CONFIG REQUIRED - HINTS ${CUDAToolkit_LIBRARY_DIR}/cmake) - thrust_create_target(Thrust HOST CPP DEVICE CUDA) - add_library(CCCL::CCCL INTERFACE IMPORTED GLOBAL) - target_link_libraries(CCCL::CCCL INTERFACE libcudacxx::libcudacxx CUB::CUB Thrust) - endif() - endif() - # Define guard macros to prevent windows.h from conflicting with winsock2.h - if(WIN32) - target_compile_definitions(CCCL::CCCL INTERFACE NOMINMAX WIN32_LEAN_AND_MEAN _WINSOCKAPI_) - endif() endif() if(FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND @@ -338,6 +314,34 @@ if(PLUGIN_RMM) list(REMOVE_ITEM rmm_link_libs CUDA::cudart) list(APPEND rmm_link_libs CUDA::cudart_static) set_target_properties(rmm::rmm PROPERTIES INTERFACE_LINK_LIBRARIES "${rmm_link_libs}") + + # Pick up patched CCCL from RMM +elseif(USE_CUDA) + # If using CUDA and not RMM, search for CCCL. + find_package(CCCL CONFIG) + if(CCCL_FOUND) + message(STATUS "Standalone CCCL found.") + else() + message(STATUS "Standalone CCCL not found. Attempting to use CCCL from CUDA Toolkit...") + find_package(CCCL CONFIG + HINTS ${CUDAToolkit_LIBRARY_DIR}/cmake) + if(NOT CCCL_FOUND) + message(STATUS "Could not locate CCCL from CUDA Toolkit. Using Thrust and CUB from CUDA Toolkit...") + find_package(libcudacxx CONFIG REQUIRED + HINTS ${CUDAToolkit_LIBRARY_DIR}/cmake) + find_package(CUB CONFIG REQUIRED + HINTS ${CUDAToolkit_LIBRARY_DIR}/cmake) + find_package(Thrust CONFIG REQUIRED + HINTS ${CUDAToolkit_LIBRARY_DIR}/cmake) + thrust_create_target(Thrust HOST CPP DEVICE CUDA) + add_library(CCCL::CCCL INTERFACE IMPORTED GLOBAL) + target_link_libraries(CCCL::CCCL INTERFACE libcudacxx::libcudacxx CUB::CUB Thrust) + endif() + endif() + # Define guard macros to prevent windows.h from conflicting with winsock2.h + if(WIN32) + target_compile_definitions(CCCL::CCCL INTERFACE NOMINMAX WIN32_LEAN_AND_MEAN _WINSOCKAPI_) + endif() endif() if(PLUGIN_SYCL) From 0500992cccd1a695fe0030184fb84d0a5f3d703b Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 22 Mar 2025 14:14:31 +0800 Subject: [PATCH 014/224] Implement ordinal recoder for the GPU predictor. (#11347) - Unify the code path for various GPU prediction functions. - Implement re-coding for the GPU predictor. 
--- include/xgboost/c_api.h | 44 +- python-package/xgboost/core.py | 12 +- python-package/xgboost/testing/ordinal.py | 120 ++++- src/data/cat_container.cuh | 11 +- src/data/iterative_dmatrix.cc | 4 +- src/encoder/ordinal.cuh | 41 +- src/encoder/ordinal.h | 25 +- src/predictor/cpu_predictor.cc | 9 +- src/predictor/gpu_predictor.cu | 560 ++++++++++++++-------- tests/cpp/encoder/test_ordinal.cc | 17 +- tests/cpp/encoder/test_ordinal.h | 12 + tests/python-gpu/test_gpu_linear.py | 9 +- tests/python-gpu/test_gpu_ordinal.py | 84 ++++ tests/python/test_ordinal.py | 5 + 14 files changed, 723 insertions(+), 230 deletions(-) diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index 852f65d38f52..b268e84f4ab4 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -162,7 +162,49 @@ XGB_DLL int XGDMatrixCreateFromCSREx(const size_t *indptr, const unsigned *indic * @brief Create a DMatrix from columnar data. (table) * * A special type of input to the `DMatrix` is the columnar format, which refers to - * column-based dataframes based on the arrow formatt. + * column-based dataframes. XGBoost can accept both numeric data types like integers and + * floats, along with the categorical type, called dictionary in arrow's term. The + * addition of categorical type is introduced in 3.1.0. The dataframe is represented by a + * list array interfaces with one object for each column. + * + * A categorical type is represented by 3 buffers, the validity mask, the names of the + * categories (called index for most of the dataframe implementation), and the codes used + * to represent the categories in the rows. XGBoost consumes a categorical column by + * accepting two JSON-encoded arrow arrays in a list. The first item in the list is a JSON + * object with `{"offsets": IntegerArray, "values": StringArray }` representing the string + * names defined by the arrow columnar format. 
The second buffer is an masked integer + * array that stores the categorical codes along with the validity mask: + * + * @code{javascript} + * [ + * // categorical column, represented as an array (list) + * [ + * { + * 'offsets': + * { + * 'data': (129412626415808, True), + * 'typestr': ' Tuple[Type, Type]: return Df, Ser +def asarray(device: str, data: Any) -> np.ndarray: + """Wrapper to get an array.""" + if device == "cpu": + return np.asarray(data) + import cupy as cp + + return cp.asarray(data) + + def assert_allclose(device: str, a: Any, b: Any) -> None: """Dispatch the assert_allclose for devices.""" if device == "cpu": @@ -273,12 +282,12 @@ def run_mixed(DMatrixT: Type) -> None: # used with the next df b_codes = df.b.cat.codes - np.testing.assert_allclose(np.asarray(b_codes), np.array([1, 0, 2])) + assert_allclose(device, asarray(device, b_codes), np.array([1, 0, 2])) # pick codes of 3, 1 b_encoded = np.array([b_codes.iloc[2], b_codes.iloc[1]]) c_codes = df.c.cat.codes - np.testing.assert_allclose(np.asarray(c_codes), np.array([1, 0, 2])) + assert_allclose(device, asarray(device, c_codes), np.array([1, 0, 2])) # pick codes of "def", "abc" c_encoded = np.array([c_codes.iloc[2], c_codes.iloc[1]]) encoded = np.stack([b_encoded, c_encoded], axis=1) @@ -317,13 +326,19 @@ def run_invalid(DMatrixT: Type) -> None: with pytest.raises(ValueError, match="The data type doesn't match"): booster.predict(Xy) + df = Df( + {"b": [2, 1, 3, 4], "c": ["cdef", "abc", "def", "bbc"]}, dtype="category" + ) + with pytest.raises(ValueError, match="Found a category not in the training"): + booster.inplace_predict(df) + for dm in (DMatrix, QuantileDMatrix): run_invalid(dm) def run_cat_thread_safety(device: Literal["cpu", "cuda"]) -> None: """Basic tests for thread safety.""" - X, y = make_categorical(2048, 16, 112, onehot=False, cat_ratio=0.5) + X, y = make_categorical(2048, 16, 112, onehot=False, cat_ratio=0.5, device=device) Xy = QuantileDMatrix(X, y, enable_categorical=True) booster = train({"device": device}, Xy, num_boost_round=10) @@ -412,3 +427,102 @@ def run_cat_leaf(device: Literal["cpu", "cuda"]) -> None: _run_predt( device, DMatrix, pred_contribs=False, pred_interactions=False, pred_leaf=True ) + + +def run_specified_cat( # pylint: disable=too-many-locals + device: Literal["cpu", "cuda"], +) -> None: + """Run with manually specified category encoding.""" + import pandas as pd + + # Same between old and new, wiht 0 ("a") and 1 ("b") exchanged their position. + old_cats = ["a", "b", "c", "d"] + new_cats = ["b", "a", "c", "d"] + mapping = {0: 1, 1: 0} + + col0 = np.arange(0, 9) + col1 = pd.Categorical.from_codes( + # b, b, c, d, a, c, c, d, a + categories=old_cats, + codes=[1, 1, 2, 3, 0, 2, 2, 3, 0], + ) + df = pd.DataFrame({"f0": col0, "f1": col1}) + Df, _ = get_df_impl(device) + df = Df(df) + rng = np.random.default_rng(2025) + y = rng.uniform(size=df.shape[0]) + + for dm in (DMatrix, QuantileDMatrix): + Xy = dm(df, y, enable_categorical=True) + booster = train({"device": device}, Xy) + predt0 = booster.predict(Xy) + predt1 = booster.inplace_predict(df) + assert_allclose(device, predt0, predt1) + + col1 = pd.Categorical.from_codes( + # b, b, c, d, a, c, c, d, a + categories=new_cats, + codes=[0, 0, 2, 3, 1, 2, 2, 3, 1], + ) + df1 = Df({"f0": col0, "f1": col1}) + predt2 = booster.inplace_predict(df1) + assert_allclose(device, predt0, predt2) + + # Test large column numbers. XGBoost makes some specializations for slim datasets, + # make sure we cover all the cases. 
+ n_features = 4096 + n_samples = 1024 + df = pd.DataFrame() + col_numeric = rng.uniform(0, 1, size=(n_samples, n_features // 2)) + col_categorical = rng.integers( + low=0, high=4, size=(n_samples, n_features // 2), dtype=np.int32 + ) + + for c in range(n_features): + if c % 2 == 0: + col = col_numeric[:, c // 2] + else: + codes = col_categorical[:, c // 2] + col = pd.Categorical.from_codes( + categories=old_cats, + codes=codes, + ) + df[f"f{c}"] = col + + df = Df(df) + y = rng.normal(size=n_samples) + + Xy = DMatrix(df, y, enable_categorical=True) + booster = train({"device": device}, Xy) + + predt0 = booster.predict(Xy) + predt1 = booster.inplace_predict(df) + assert_allclose(device, predt0, predt1) + + for c in range(n_features): + if c % 2 == 0: + continue + + name = f"f{c}" + codes_ser = df[name].cat.codes + if hasattr(codes_ser, "to_pandas"): # cudf + codes_ser = codes_ser.to_pandas() + new_codes = codes_ser.replace(mapping) + df[name] = pd.Categorical.from_codes(categories=new_cats, codes=new_codes) + + df = Df(df) + Xy = DMatrix(df, y, enable_categorical=True) + predt2 = booster.predict(Xy) + assert_allclose(device, predt0, predt2) + + array = np.empty(shape=(n_samples, n_features)) + array[:, np.arange(0, n_features) % 2 == 0] = col_numeric + array[:, np.arange(0, n_features) % 2 != 0] = col_categorical + + if device == "cuda": + import cupy as cp + + array = cp.array(array) + + predt3 = booster.inplace_predict(array) + assert_allclose(device, predt0, predt3) diff --git a/src/data/cat_container.cuh b/src/data/cat_container.cuh index 8cfbf6ee16e1..9522a97c856a 100644 --- a/src/data/cat_container.cuh +++ b/src/data/cat_container.cuh @@ -60,13 +60,12 @@ struct EncThrustPolicy { template using ThrustAllocator = dh::XGBDeviceAllocator; - auto ThrustPolicy() const { -#if defined(XGBOOST_USE_RMM) - return rmm::exec_policy_nosync{}; -#else - return dh::CachingThrustPolicy(); -#endif // defined(XGBOOST_USE_RMM) + [[nodiscard]] auto ThrustPolicy() const { + dh::XGBCachingDeviceAllocator alloc; + auto exec = thrust::cuda::par_nosync(alloc).on(dh::DefaultStream()); + return exec; } + [[nodiscard]] auto Stream() const { return dh::DefaultStream(); } }; using EncPolicyT = enc::Policy; diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc index 3f59af9ffda2..2d6f7451d43d 100644 --- a/src/data/iterative_dmatrix.cc +++ b/src/data/iterative_dmatrix.cc @@ -51,7 +51,7 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro this->batch_ = p; LOG(INFO) << "Finished constructing the `IterativeDMatrix`: (" << this->Info().num_row_ << ", " - << this->Info().num_col_ << ", " << this->Info().num_nonzero_ << ")."; + << this->Info().num_col_ << ", " << this->info_.num_nonzero_ << ")."; } void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p, @@ -110,7 +110,7 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p, accumulated_rows += BatchSamples(proxy); } iter.Reset(); - CHECK_EQ(accumulated_rows, Info().num_row_); + CHECK_EQ(accumulated_rows, this->info_.num_row_); if (ext_info.n_batches == 1) { this->info_ = std::move(proxy->Info()); diff --git a/src/encoder/ordinal.cuh b/src/encoder/ordinal.cuh index 282441d4a0d3..42300e4c38ef 100644 --- a/src/encoder/ordinal.cuh +++ b/src/encoder/ordinal.cuh @@ -98,8 +98,8 @@ struct SegmentedSearchSortedNumOp { haystack_v.feature_segments[f_idx + 1] - haystack_v.feature_segments[f_idx]); auto end_it = it + f_sorted_idx.size(); auto ret_it = thrust::lower_bound(thrust::seq, it, 
end_it, SearchKey(), [&](auto l, auto r) { - T l_value = l == SearchKey() ? needle : haystack[ref_sorted_idx[l]]; - T r_value = r == SearchKey() ? needle : haystack[ref_sorted_idx[r]]; + T l_value = l == SearchKey() ? needle : haystack[f_sorted_idx[l]]; + T r_value = r == SearchKey() ? needle : haystack[f_sorted_idx[r]]; return l_value < r_value; }); if (ret_it == it + f_sorted_idx.size()) { @@ -122,7 +122,8 @@ struct DftThrustPolicy { template using ThrustAllocator = thrust::device_allocator; - auto ThrustPolicy() const { return thrust::cuda::par_nosync; } + [[nodiscard]] auto ThrustPolicy() const { return thrust::cuda::par_nosync; } + [[nodiscard]] auto Stream() const { return cudaStreamPerThread; } }; } // namespace cuda_impl @@ -144,12 +145,15 @@ using DftDevicePolicy = Policy void SortNames(ExecPolicy const& policy, DeviceColumnsView orig_enc, Span sorted_idx) { + typename ExecPolicy::template ThrustAllocator alloc; + auto exec = thrust::cuda::par_nosync(alloc).on(policy.Stream()); + auto n_total_cats = orig_enc.n_total_cats; if (static_cast(sorted_idx.size()) != orig_enc.n_total_cats) { policy.Error("`sorted_idx` should have the same size as `n_total_cats`."); } auto d_sorted_idx = dh::ToSpan(sorted_idx); - cuda_impl::SegmentedIota(policy.ThrustPolicy(), orig_enc.feature_segments, d_sorted_idx); + cuda_impl::SegmentedIota(exec, orig_enc.feature_segments, d_sorted_idx); // using Pair = cuda::std::pair; @@ -162,9 +166,9 @@ void SortNames(ExecPolicy const& policy, DeviceColumnsView orig_enc, auto idx = d_sorted_idx[i]; return cuda::std::make_pair(static_cast(seg), idx); })); - thrust::copy(policy.ThrustPolicy(), key_it, key_it + n_total_cats, keys.begin()); + thrust::copy(exec, key_it, key_it + n_total_cats, keys.begin()); - thrust::sort(policy.ThrustPolicy(), keys.begin(), keys.end(), + thrust::sort(exec, keys.begin(), keys.end(), cuda::proclaim_return_type([=] __device__(Pair const& l, Pair const& r) { if (l.first == r.first) { // same feature auto const& col = orig_enc.columns[l.first]; @@ -193,7 +197,7 @@ void SortNames(ExecPolicy const& policy, DeviceColumnsView orig_enc, thrust::make_counting_iterator(0), cuda::proclaim_return_type( [=] __device__(std::int32_t i) { return s_keys[i].second; })); - thrust::copy(policy.ThrustPolicy(), it, it + sorted_idx.size(), dh::tbegin(sorted_idx)); + thrust::copy(exec, it, it + sorted_idx.size(), dh::tbegin(sorted_idx)); } /** @@ -212,8 +216,27 @@ template void Recode(ExecPolicy const& policy, DeviceColumnsView orig_enc, Span sorted_idx, DeviceColumnsView new_enc, Span mapping) { - auto exec = policy.ThrustPolicy(); + typename ExecPolicy::template ThrustAllocator alloc; + auto exec = thrust::cuda::par_nosync(alloc).on(policy.Stream()); detail::BasicChecks(policy, orig_enc, sorted_idx, new_enc, mapping); + /** + * Check consistency. + */ + auto check_it = thrust::make_transform_iterator( + thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) { + auto l_f = orig_enc.columns[i]; + auto r_f = new_enc.columns[i]; + auto l_is_empty = cuda::std::visit([](auto&& arg) { return arg.empty(); }, l_f); + auto r_is_empty = cuda::std::visit([](auto&& arg) { return arg.empty(); }, r_f); + return l_is_empty == r_is_empty; + }); + bool valid = thrust::reduce(exec, check_it, check_it + new_enc.Size(), true, + [=] XGBOOST_DEVICE(bool l, bool r) { return l && r; }); + if (!valid) { + policy.Error( + "Invalid new DataFrame. 
" + "The data type doesn't match the one used in the training dataset."); + } /** * search the index for the new encoding @@ -222,7 +245,7 @@ void Recode(ExecPolicy const& policy, DeviceColumnsView orig_enc, exec, thrust::make_counting_iterator(0), new_enc.n_total_cats, [=] __device__(std::int32_t i) { auto f_idx = dh::SegmentId(new_enc.feature_segments, i); - std::int32_t searched_idx{-1}; + std::int32_t searched_idx{detail::NotFound()}; auto const& col = orig_enc.columns[f_idx]; cuda::std::visit(Overloaded{[&](CatStrArrayView const& str) { auto op = cuda_impl::SegmentedSearchSortedStrOp{ diff --git a/src/encoder/ordinal.h b/src/encoder/ordinal.h index bfb334d29666..d4de6d0c8a59 100644 --- a/src/encoder/ordinal.h +++ b/src/encoder/ordinal.h @@ -107,7 +107,8 @@ using DeviceCatIndexView = cuda_impl::TupToVarT; * Accepted policies: * * - A class with a `ThrustPolicy` method that returns a thrust execution policy, along with a - * `ThrustAllocator` template type. This is only used for the GPU implementation. + * `ThrustAllocator` template type. In addition, a `Stream` method that returns a CUDA stream. + * This is only used for the GPU implementation. * * - An error handling policy that exposes a single `Error` method, which takes a single * string parameter for error message. @@ -133,6 +134,7 @@ struct ColumnsViewImpl { [[nodiscard]] std::size_t Size() const { return columns.size(); } [[nodiscard]] bool Empty() const { return this->Size() == 0; } [[nodiscard]] auto operator[](std::size_t i) const { return columns[i]; } + [[nodiscard]] auto HasCategorical() const { return n_total_cats != 0; } }; struct DftErrorHandler { @@ -418,4 +420,25 @@ inline std::ostream &operator<<(std::ostream &os, CatStrArrayView const &strings os << "]"; return os; } + +inline std::ostream &operator<<(std::ostream &os, HostColumnsView const &h_enc) { + for (std::size_t i = 0; i < h_enc.columns.size(); ++i) { + auto const &col = h_enc.columns[i]; + os << "f" << i << ": "; + std::visit(enc::Overloaded{[&](enc::CatStrArrayView const &str) { os << str; }, + [&](auto &&values) { + os << "["; + for (std::size_t j = 0, n = values.size(); j < n; ++j) { + os << values[j]; + if (j != n - 1) { + os << ", "; + } + } + os << "]"; + }}, + col); + os << std::endl; + } + return os; +} } // namespace enc diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc index d986882a6795..c82ece98d83c 100644 --- a/src/predictor/cpu_predictor.cc +++ b/src/predictor/cpu_predictor.cc @@ -367,13 +367,14 @@ static void InitThreadTemp(int nthread, std::vector *out) { } } -auto MakeCatAccessor(Context const *ctx, enc::HostColumnsView const &cats, +auto MakeCatAccessor(Context const *ctx, enc::HostColumnsView const &new_enc, gbm::GBTreeModel const &model) { - std::vector mapping(cats.n_total_cats); + std::vector mapping(new_enc.n_total_cats); auto sorted_idx = model.Cats()->RefSortedIndex(ctx); auto orig_enc = model.Cats()->HostView(); - enc::Recode(cpu_impl::EncPolicy, orig_enc, sorted_idx, cats, common::Span{mapping}); - auto cats_mapping = enc::MappingView{cats.feature_segments, mapping}; + enc::Recode(cpu_impl::EncPolicy, orig_enc, sorted_idx, new_enc, common::Span{mapping}); + CHECK_EQ(new_enc.feature_segments.size(), orig_enc.feature_segments.size()); + auto cats_mapping = enc::MappingView{new_enc.feature_segments, mapping}; auto acc = CatAccessor{cats_mapping}; return std::tuple{acc, std::move(mapping)}; } diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index d99f00cd35a4..f00641d9f5a7 100644 
--- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -16,11 +16,13 @@ #include "../common/cuda_context.cuh" // for CUDAContext #include "../common/cuda_rt_utils.h" // for AllVisibleGPUs, SetDevice #include "../common/device_helpers.cuh" -#include "../common/error_msg.h" // for InplacePredictProxy -#include "../data/batch_utils.h" // for StaticBatch +#include "../common/error_msg.h" // for InplacePredictProxy +#include "../data/batch_utils.h" // for StaticBatch +#include "../data/cat_container.cuh" // for EncPolicy #include "../data/device_adapter.cuh" #include "../data/ellpack_page.cuh" #include "../data/proxy_dmatrix.h" +#include "../encoder/ordinal.cuh" // for CudaCategoryRecoder #include "../gbm/gbtree_model.h" #include "predict_fn.h" #include "xgboost/data.h" @@ -74,9 +76,8 @@ struct SparsePageView { SparsePageView() = default; XGBOOST_DEVICE SparsePageView(common::Span data, - common::Span row_ptr, - bst_feature_t num_features) - : d_data{data}, d_row_ptr{row_ptr}, num_features(num_features) {} + common::Span row_ptr, bst_feature_t n_features) + : d_data{data}, d_row_ptr{row_ptr}, num_features(n_features) {} [[nodiscard]] __device__ float GetElement(size_t ridx, size_t fidx) const { // Binary search auto begin_ptr = d_data.begin() + d_row_ptr[ridx]; @@ -109,14 +110,19 @@ struct SparsePageView { [[nodiscard]] XGBOOST_DEVICE size_t NumCols() const { return num_features; } }; +template struct SparsePageLoader { + private: + EncAccessor acc_; + + public: bool use_shared; SparsePageView data; float* smem; __device__ SparsePageLoader(SparsePageView data, bool use_shared, bst_feature_t num_features, - bst_idx_t num_rows, float) - : use_shared(use_shared), data(data) { + bst_idx_t num_rows, float, EncAccessor&& acc) + : use_shared(use_shared), data(data), acc_{std::forward(acc)} { extern __shared__ float _smem[]; smem = _smem; // Copy instances @@ -130,7 +136,7 @@ struct SparsePageLoader { bst_uint elem_end = data.d_row_ptr[global_idx + 1]; for (bst_uint elem_idx = elem_begin; elem_idx < elem_end; elem_idx++) { Entry elem = data.d_data[elem_idx]; - smem[threadIdx.x * data.num_features + elem.index] = elem.fvalue; + smem[threadIdx.x * data.num_features + elem.index] = this->acc_(elem); } } __syncthreads(); @@ -140,22 +146,27 @@ struct SparsePageLoader { if (use_shared) { return smem[threadIdx.x * data.num_features + fidx]; } else { - return data.GetElement(ridx, fidx); + return this->acc_(data.GetElement(ridx, fidx), fidx); } } }; +template struct EllpackLoader { EllpackDeviceAccessor matrix; - XGBOOST_DEVICE EllpackLoader(EllpackDeviceAccessor m, bool, bst_feature_t, bst_idx_t, float) - : matrix{std::move(m)} {} + EncAccessor acc; + + XGBOOST_DEVICE EllpackLoader(EllpackDeviceAccessor m, bool /*use_shared*/, + bst_feature_t /*n_features*/, bst_idx_t /*n_samples*/, + float /*missing*/, EncAccessor&& acc) + : matrix{std::move(m)}, acc{std::forward(acc)} {} [[nodiscard]] XGBOOST_DEV_INLINE float GetElement(size_t ridx, size_t fidx) const { auto gidx = matrix.GetBinIndex(ridx, fidx); if (gidx == -1) { return std::numeric_limits::quiet_NaN(); } if (common::IsCat(matrix.feature_types, fidx)) { - return matrix.gidx_fvalue_map[gidx]; + return this->acc(matrix.gidx_fvalue_map[gidx], fidx); } // The gradient index needs to be shifted by one as min values are not included in the // cuts. @@ -168,34 +179,45 @@ struct EllpackLoader { [[nodiscard]] XGBOOST_DEVICE bst_idx_t NumRows() const { return this->matrix.n_rows; } }; -template +/** + * @brief Use for in-place predict. 
+ */ +template struct DeviceAdapterLoader { - Batch batch; - bst_feature_t columns; + private: + Batch batch_; + EncAccessor acc_; + + public: + bst_feature_t n_features; float* smem; bool use_shared; data::IsValidFunctor is_valid; + using BatchT = Batch; - XGBOOST_DEV_INLINE DeviceAdapterLoader(Batch const batch, bool use_shared, - bst_feature_t num_features, bst_idx_t num_rows, - float missing) - : batch{batch}, columns{num_features}, use_shared{use_shared}, is_valid{missing} { + XGBOOST_DEV_INLINE DeviceAdapterLoader(Batch&& batch, bool use_shared, bst_feature_t n_features, + bst_idx_t n_samples, float missing, EncAccessor&& acc) + : batch_{std::move(batch)}, + acc_{std::forward(acc)}, + n_features{n_features}, + use_shared{use_shared}, + is_valid{missing} { extern __shared__ float _smem[]; - smem = _smem; - if (use_shared) { - uint32_t global_idx = blockDim.x * blockIdx.x + threadIdx.x; - size_t shared_elements = blockDim.x * num_features; + this->smem = _smem; + if (this->use_shared) { + auto global_idx = blockDim.x * blockIdx.x + threadIdx.x; + size_t shared_elements = blockDim.x * n_features; dh::BlockFill(smem, shared_elements, std::numeric_limits::quiet_NaN()); __syncthreads(); - if (global_idx < num_rows) { - auto beg = global_idx * columns; - auto end = (global_idx + 1) * columns; + if (global_idx < n_samples) { + auto beg = global_idx * n_features; + auto end = (global_idx + 1) * n_features; for (size_t i = beg; i < end; ++i) { - auto value = batch.GetElement(i).value; - if (is_valid(value)) { - smem[threadIdx.x * num_features + (i - beg)] = value; + data::COOTuple const& e = this->batch_.GetElement(i); + if (is_valid(e)) { + smem[threadIdx.x * n_features + (i - beg)] = this->acc_(e); } } } @@ -205,11 +227,11 @@ struct DeviceAdapterLoader { [[nodiscard]] XGBOOST_DEV_INLINE float GetElement(size_t ridx, size_t fidx) const { if (use_shared) { - return smem[threadIdx.x * columns + fidx]; + return smem[threadIdx.x * n_features + fidx]; } - auto value = batch.GetElement(ridx * columns + fidx).value; + auto value = this->batch_.GetElement(ridx * n_features + fidx).value; if (is_valid(value)) { - return value; + return this->acc_(value, fidx); } else { return std::numeric_limits::quiet_NaN(); } @@ -241,7 +263,7 @@ __device__ float GetLeafWeight(bst_idx_t ridx, TreeView const &tree, return tree.d_tree[nidx].LeafValue(); } -template +template __global__ void PredictLeafKernel(Data data, common::Span d_nodes, common::Span d_out_predictions, @@ -254,12 +276,12 @@ PredictLeafKernel(Data data, common::Span d_nodes, bst_tree_t tree_begin, bst_tree_t tree_end, bst_feature_t num_features, size_t num_rows, bool use_shared, - float missing) { + float missing, EncAccessor acc) { bst_idx_t ridx = blockDim.x * blockIdx.x + threadIdx.x; if (ridx >= num_rows) { return; } - Loader loader{data, use_shared, num_features, num_rows, missing}; + Loader loader{std::move(data), use_shared, num_features, num_rows, missing, std::move(acc)}; for (bst_tree_t tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) { TreeView d_tree{ tree_begin, tree_idx, d_nodes, @@ -268,15 +290,15 @@ PredictLeafKernel(Data data, common::Span d_nodes, bst_node_t leaf = -1; if (d_tree.HasCategoricalSplit()) { - leaf = GetLeafIndex(ridx, d_tree, &loader); + leaf = GetLeafIndex(ridx, d_tree, &loader); } else { - leaf = GetLeafIndex(ridx, d_tree, &loader); + leaf = GetLeafIndex(ridx, d_tree, &loader); } d_out_predictions[ridx * (tree_end - tree_begin) + tree_idx] = leaf; } } -template +template __global__ void PredictKernel(Data 
data, common::Span d_nodes, common::Span d_out_predictions, @@ -286,10 +308,10 @@ PredictKernel(Data data, common::Span d_nodes, common::Span d_cat_tree_segments, common::Span d_cat_node_segments, common::Span d_categories, bst_tree_t tree_begin, - bst_tree_t tree_end, size_t num_features, size_t num_rows, - bool use_shared, int num_group, float missing) { + bst_tree_t tree_end, bst_feature_t num_features, size_t num_rows, + bool use_shared, int num_group, float missing, EncAccessor acc) { bst_uint global_idx = blockDim.x * blockIdx.x + threadIdx.x; - Loader loader(data, use_shared, num_features, num_rows, missing); + Loader loader{std::move(data), use_shared, num_features, num_rows, missing, std::move(acc)}; if (global_idx >= num_rows) return; if (num_group == 1) { @@ -332,11 +354,13 @@ class DeviceModel { HostDeviceVector categories_node_segments; HostDeviceVector categories; - size_t tree_beg_; // NOLINT - size_t tree_end_; // NOLINT + bst_tree_t tree_beg_; // NOLINT + bst_tree_t tree_end_; // NOLINT int num_group; + CatContainer const* cat_enc{nullptr}; - void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, DeviceOrd device) { + void Init(const gbm::GBTreeModel& model, bst_tree_t tree_begin, bst_tree_t tree_end, + DeviceOrd device) { dh::safe_cuda(cudaSetDevice(device.ordinal)); // Copy decision trees to device @@ -406,17 +430,21 @@ class DeviceModel { this->tree_beg_ = tree_begin; this->tree_end_ = tree_end; this->num_group = model.learner_model_param->OutputLength(); + + this->cat_enc = model.Cats(); + CHECK(this->cat_enc); } }; struct ShapSplitCondition { ShapSplitCondition() = default; XGBOOST_DEVICE - ShapSplitCondition(float feature_lower_bound, float feature_upper_bound, - bool is_missing_branch, common::CatBitField cats) + ShapSplitCondition(float feature_lower_bound, float feature_upper_bound, bool is_missing_branch, + common::CatBitField cats) : feature_lower_bound(feature_lower_bound), feature_upper_bound(feature_upper_bound), - is_missing_branch(is_missing_branch), categories{std::move(cats)} { + is_missing_branch(is_missing_branch), + categories{std::move(cats)} { assert(feature_lower_bound <= feature_upper_bound); } @@ -624,7 +652,7 @@ __global__ void MaskBitVectorKernel( bst_tree_t tree_begin, bst_tree_t tree_end, bst_feature_t num_features, std::size_t num_rows, std::size_t num_nodes, bool use_shared, float missing) { // This needs to be always instantiated since the data is loaded cooperatively by all threads. 
- SparsePageLoader loader{data, use_shared, num_features, num_rows, missing}; + SparsePageLoader loader{data, use_shared, num_features, num_rows, missing, NoOpAccessor{}}; auto const row_idx = blockIdx.x * blockDim.x + threadIdx.x; if (row_idx >= num_rows) { return; @@ -841,99 +869,209 @@ class ColumnSplitHelper { Context const* ctx_; }; -} // anonymous namespace -class GPUPredictor : public xgboost::Predictor { +auto MakeCatAccessor(Context const* ctx, enc::DeviceColumnsView const& new_enc, + DeviceModel const& model) { + dh::DeviceUVector mapping(new_enc.n_total_cats); + auto d_sorted_idx = model.cat_enc->RefSortedIndex(ctx); + auto orig_enc = model.cat_enc->DeviceView(ctx); + enc::Recode(cuda_impl::EncPolicy, orig_enc, d_sorted_idx, new_enc, dh::ToSpan(mapping)); + CHECK_EQ(new_enc.feature_segments.size(), orig_enc.feature_segments.size()); + auto cats_mapping = enc::MappingView{new_enc.feature_segments, dh::ToSpan(mapping)}; + auto acc = CatAccessor{cats_mapping}; + return std::tuple{acc, std::move(mapping)}; +} + +template +struct ShapSparsePageView { + SparsePageView data; + EncAccessor acc; + + template + [[nodiscard]] __device__ float GetElement(bst_idx_t ridx, Fidx fidx) const { + auto fvalue = data.GetElement(ridx, fidx); + return acc(fvalue, fidx); + } + [[nodiscard]] XGBOOST_DEVICE bst_idx_t NumRows() const { return data.NumRows(); } + [[nodiscard]] XGBOOST_DEVICE bst_idx_t NumCols() const { return data.NumCols(); } +}; + +template +void LaunchPredictKernel(Context const* ctx, bool is_dense, enc::DeviceColumnsView const& new_enc, + DeviceModel const& model, Kernel&& launch) { + if (is_dense) { + auto is_dense = std::true_type{}; + if (model.cat_enc->HasCategorical() && new_enc.HasCategorical()) { + auto [acc, mapping] = MakeCatAccessor(ctx, new_enc, model); + launch(is_dense, std::move(acc)); + } else { + launch(is_dense, NoOpAccessor{}); + } + } else { + auto is_dense = std::false_type{}; + if (model.cat_enc->HasCategorical() && new_enc.HasCategorical()) { + auto [acc, mapping] = MakeCatAccessor(ctx, new_enc, model); + launch(is_dense, std::move(acc)); + } else { + launch(is_dense, NoOpAccessor{}); + } + } +} + +// provide configuration for launching the predict kernel. +template +class LaunchConfig { private: - void PredictInternal(const SparsePage& batch, DeviceModel const& model, size_t num_features, - HostDeviceVector* predictions, size_t batch_offset, - bool is_dense) const { - batch.offset.SetDevice(ctx_->Device()); - batch.data.SetDevice(ctx_->Device()); - const uint32_t BLOCK_THREADS = 128; - bst_idx_t num_rows = batch.Size(); - auto GRID_SIZE = static_cast(common::DivRoundUp(num_rows, BLOCK_THREADS)); - auto max_shared_memory_bytes = ConfigureDevice(ctx_->Device()); - size_t shared_memory_bytes = - SharedMemoryBytes(num_features, max_shared_memory_bytes); - bool use_shared = shared_memory_bytes != 0; - - SparsePageView data(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), - num_features); - auto const kernel = [&](auto predict_fn) { - dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes, ctx_->CUDACtx()->Stream()}( - predict_fn, data, model.nodes.ConstDeviceSpan(), + static auto constexpr NotSet() { return std::numeric_limits::max(); } + + Context const* ctx_; + std::size_t const shared_memory_bytes_; + bst_idx_t n_samples_{NotSet()}; + + template + void LaunchImpl(K&& kernel, Args&&... 
args) const&& { + CHECK_NE(this->n_samples_, NotSet()); + auto grid = static_cast(common::DivRoundUp(this->n_samples_, kBlockThreads)); + dh::LaunchKernel {grid, kBlockThreads, shared_memory_bytes_, ctx_->CUDACtx()->Stream()}( + kernel, std::forward(args)...); + } + + [[nodiscard]] LaunchConfig Grid(bst_idx_t n_samples) const { + LaunchConfig cfg = *this; + cfg.n_samples_ = n_samples; + return cfg; + } + [[nodiscard]] bool UseShared() const { return shared_memory_bytes_ != 0; } + + [[nodiscard]] static std::size_t ConfigureDevice(DeviceOrd const& device) { + thread_local std::unordered_map max_shared; + if (device.IsCUDA()) { + auto it = max_shared.find(device.ordinal); + if (it == max_shared.cend()) { + max_shared[device.ordinal] = dh::MaxSharedMemory(device.ordinal); + it = max_shared.find(device.ordinal); + } + return it->second; + } + return 0; + } + + public: + LaunchConfig(Context const* ctx, bst_feature_t n_features) + : ctx_{ctx}, + shared_memory_bytes_{kUseShared ? SharedMemoryBytes( + n_features, ConfigureDevice(ctx->Device())) + : 0} {} + + template