abhinav-upadhyay
diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst
Lines changed: 41 additions & 4 deletions
diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py
Lines changed: 66 additions & 59 deletions
@@ -404,22 +404,26 @@ preprocessing or a specific feature extraction method::
   >>> X = pd.DataFrame(
   ...     {'city': ['London', 'London', 'Paris', 'Sallisaw'],
   ...      'title': ["His Last Bow", "How Watson Learned the Trick",
-  ...                "A Moveable Feast", "The Grapes of Wrath"]})
+  ...                "A Moveable Feast", "The Grapes of Wrath"],
+  ...      'expert_rating': [5, 3, 4, 5],
+  ...      'user_rating': [4, 5, 4, 3]})
 
 For this data, we might want to encode the ``'city'`` column as a categorical
 variable, but apply a :class:`feature_extraction.text.CountVectorizer
 <sklearn.feature_extraction.text.CountVectorizer>` to the ``'title'`` column.
 As we might use multiple feature extraction methods on the same column, we give
-each transformer a unique name, say ``'city_category'`` and ``'title_bow'``::
+each transformer a unique name, say ``'city_category'`` and ``'title_bow'``.
+We can ignore the remaining rating columns by setting ``remainder='drop'``::
 
   >>> from sklearn.compose import ColumnTransformer
   >>> from sklearn.feature_extraction.text import CountVectorizer
   >>> column_trans = ColumnTransformer(
   ...     [('city_category', CountVectorizer(analyzer=lambda x: [x]), 'city'),
-  ...      ('title_bow', CountVectorizer(), 'title')])
+  ...      ('title_bow', CountVectorizer(), 'title')],
+  ...      remainder='drop')
 
   >>> column_trans.fit(X) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
-  ColumnTransformer(n_jobs=1, remainder='passthrough', transformer_weights=None,
+  ColumnTransformer(n_jobs=1, remainder='drop', transformer_weights=None,
       transformers=...)
 
   >>> column_trans.get_feature_names()
@@ -448,6 +452,39 @@ as a list of multiple items, an integer array, a slice, or a boolean mask.
 Strings can reference columns if the input is a DataFrame; integers are always
 interpreted as positional columns.
 
+We can keep the remaining rating columns by setting
+``remainder='passthrough'``. The values are appended to the end of the
+transformation::
+
+  >>> column_trans = ColumnTransformer(
+  ...     [('city_category', CountVectorizer(analyzer=lambda x: [x]), 'city'),
+  ...      ('title_bow', CountVectorizer(), 'title')],
+  ...      remainder='passthrough')
+
+  >>> column_trans.fit_transform(X).toarray()
+  ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
+  array([[1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 5, 4],
+         [1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 3, 5],
+         [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 4],
+         [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 5, 3]]...)
+
+The ``remainder`` parameter can be set to an estimator to transform the
+remaining rating columns. The transformed values are appended to the end of
+the transformation::
+
+  >>> from sklearn.preprocessing import MinMaxScaler
+  >>> column_trans = ColumnTransformer(
+  ...     [('city_category', CountVectorizer(analyzer=lambda x: [x]), 'city'),
+  ...      ('title_bow', CountVectorizer(), 'title')],
+  ...      remainder=MinMaxScaler())
+
+  >>> column_trans.fit_transform(X)[:, -2:].toarray()
+  ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
+  array([[1. , 0.5],
+         [0. , 1. ],
+         [0.5, 0.5],
+         [1. , 0. ]])
+
 The :func:`~sklearn.compose.make_column_transformer` function is available
 to more easily create a :class:`~sklearn.compose.ColumnTransformer` object.
 Specifically, the names will be given automatically. The equivalent for the
 
@@ -6,7 +6,7 @@
 # Author: Andreas Mueller
 #         Joris Van den Bossche
 # License: BSD
-
+from itertools import chain
 
 import numpy as np
 from scipy import sparse
@@ -69,14 +69,17 @@ class ColumnTransformer(_BaseComposition, TransformerMixin):
             ``transformer`` expects X to be a 1d array-like (vector),
             otherwise a 2d array will be passed to the transformer.
 
-    remainder : {'passthrough', 'drop'}, default 'passthrough'
+    remainder : {'passthrough', 'drop'} or estimator, default 'passthrough'
         By default, all remaining columns that were not specified in
         `transformers` will be automatically passed through (default of
         ``'passthrough'``). This subset of columns is concatenated with the
         output of the transformers.
         By using ``remainder='drop'``, only the specified columns in
         `transformers` are transformed and combined in the output, and the
         non-specified columns are dropped.
+        By setting ``remainder`` to be an estimator, the remaining
+        non-specified columns will use the ``remainder`` estimator. The
+        estimator must support `fit` and `transform`.
 
     n_jobs : int, optional
         Number of jobs to run in parallel (default 1).
@@ -90,7 +93,13 @@ class ColumnTransformer(_BaseComposition, TransformerMixin):
     ----------
     transformers_ : list
         The collection of fitted transformers as tuples of
-        (name, fitted_transformer, column).
+        (name, fitted_transformer, column). `fitted_transformer` can be an
+        estimator, 'drop', or 'passthrough'. If there are remaining columns,
+        the final element is a tuple of the form:
+        ('remainder', transformer, remaining_columns) corresponding to the
+        ``remainder`` parameter. If there are remaining columns, then
+        ``len(transformers_)==len(transformers)+1``, otherwise
+        ``len(transformers_)==len(transformers)``.
 
     named_transformers_ : Bunch object, a dictionary with attribute access
         Read-only attribute to access any transformer by given name.
@@ -188,13 +197,12 @@ def _iter(self, X=None, fitted=False, replace_strings=False):
             transformers = self.transformers_
         else:
             transformers = self.transformers
+            if self._remainder[2] is not None:
+                transformers = chain(transformers, [self._remainder])
         get_weight = (self.transformer_weights or {}).get
 
         for name, trans, column in transformers:
-            if X is None:
-                sub = X
-            else:
-                sub = _get_column(X, column)
+            sub = None if X is None else _get_column(X, column)
 
             if replace_strings:
                 # replace 'passthrough' with identity transformer and
@@ -209,7 +217,10 @@ def _iter(self, X=None, fitted=False, replace_strings=False):
             yield (name, trans, sub, get_weight(name))
 
     def _validate_transformers(self):
-        names, transformers, _, _ = zip(*self._iter())
+        if not self.transformers:
+            return
+
+        names, transformers, _ = zip(*self.transformers)
 
         # validate names
         self._validate_names(names)
@@ -226,24 +237,27 @@ def _validate_transformers(self):
                                 (t, type(t)))
 
     def _validate_remainder(self, X):
-        """Generate list of passthrough columns for 'remainder' case."""
-        if self.remainder not in ('drop', 'passthrough'):
+        """
+        Validates ``remainder`` and defines ``_remainder`` targeting
+        the remaining columns.
+        """
+        is_transformer = ((hasattr(self.remainder, "fit")
+                           or hasattr(self.remainder, "fit_transform"))
+                          and hasattr(self.remainder, "transform"))
+        if (self.remainder not in ('drop', 'passthrough')
+                and not is_transformer):
             raise ValueError(
-                "The remainder keyword needs to be one of 'drop' or "
-                "'passthrough'. {0:r} was passed instead")
+                "The remainder keyword needs to be one of 'drop', "
+                "'passthrough', or estimator. '%s' was passed instead" %
+                self.remainder)
 
         n_columns = X.shape[1]
+        cols = []
+        for _, _, columns in self.transformers:
+            cols.extend(_get_column_indices(X, columns))
+        remaining_idx = sorted(list(set(range(n_columns)) - set(cols))) or None
 
-        if self.remainder == 'passthrough':
-            cols = []
-            for _, _, columns in self.transformers:
-                cols.extend(_get_column_indices(X, columns))
-            self._passthrough = sorted(list(set(range(n_columns)) - set(cols)))
-            if not self._passthrough:
-                # empty list -> no need to select passthrough columns
-                self._passthrough = None
-        else:
-            self._passthrough = None
+        self._remainder = ('remainder', self.remainder, remaining_idx)
 
     @property
     def named_transformers_(self):
@@ -267,12 +281,6 @@ def get_feature_names(self):
             Names of the features produced by transform.
         """
         check_is_fitted(self, 'transformers_')
-        if self._passthrough is not None:
-            raise NotImplementedError(
-                "get_feature_names is not yet supported when having columns"
-                "that are passed through (you specify remainder='drop' to not "
-                "pass through the unspecified columns).")
-
         feature_names = []
         for name, trans, _, _ in self._iter(fitted=True):
             if trans == 'drop':
@@ -294,7 +302,11 @@ def _update_fitted_transformers(self, transformers):
         transformers = iter(transformers)
         transformers_ = []
 
-        for name, old, column in self.transformers:
+        transformer_iter = self.transformers
+        if self._remainder[2] is not None:
+            transformer_iter = chain(transformer_iter, [self._remainder])
+
+        for name, old, column in transformer_iter:
             if old == 'drop':
                 trans = 'drop'
             elif old == 'passthrough':
@@ -304,7 +316,6 @@ def _update_fitted_transformers(self, transformers):
                 trans = 'passthrough'
             else:
                 trans = next(transformers)
-
             transformers_.append((name, trans, column))
 
         # sanity check that transformers is exhausted
@@ -335,7 +346,7 @@ def _fit_transform(self, X, y, func, fitted=False):
             return Parallel(n_jobs=self.n_jobs)(
                 delayed(func)(clone(trans) if not fitted else trans,
                               X_sel, y, weight)
-                for name, trans, X_sel, weight in self._iter(
+                for _, trans, X_sel, weight in self._iter(
                     X=X, fitted=fitted, replace_strings=True))
         except ValueError as e:
             if "Expected 2D array, got 1D array instead" in str(e):
@@ -361,12 +372,12 @@ def fit(self, X, y=None):
             This estimator
 
         """
-        self._validate_transformers()
         self._validate_remainder(X)
+        self._validate_transformers()
 
         transformers = self._fit_transform(X, y, _fit_one_transformer)
-
         self._update_fitted_transformers(transformers)
+
         return self
 
     def fit_transform(self, X, y=None):
@@ -390,31 +401,21 @@ def fit_transform(self, X, y=None):
             sparse matrices.
 
         """
-        self._validate_transformers()
         self._validate_remainder(X)
+        self._validate_transformers()
 
         result = self._fit_transform(X, y, _fit_transform_one)
 
         if not result:
             # All transformers are None
-            if self._passthrough is None:
-                return np.zeros((X.shape[0], 0))
-            else:
-                return _get_column(X, self._passthrough)
+            return np.zeros((X.shape[0], 0))
 
         Xs, transformers = zip(*result)
 
         self._update_fitted_transformers(transformers)
         self._validate_output(Xs)
 
-        if self._passthrough is not None:
-            Xs = list(Xs) + [_get_column(X, self._passthrough)]
-
-        if any(sparse.issparse(f) for f in Xs):
-            Xs = sparse.hstack(Xs).tocsr()
-        else:
-            Xs = np.hstack(Xs)
-        return Xs
+        return _hstack(list(Xs))
 
     def transform(self, X):
         """Transform X separately by each transformer, concatenate results.
@@ -440,19 +441,9 @@ def transform(self, X):
 
         if not Xs:
             # All transformers are None
-            if self._passthrough is None:
-                return np.zeros((X.shape[0], 0))
-            else:
-                return _get_column(X, self._passthrough)
-
-        if self._passthrough is not None:
-            Xs = list(Xs) + [_get_column(X, self._passthrough)]
+            return np.zeros((X.shape[0], 0))
 
-        if any(sparse.issparse(f) for f in Xs):
-            Xs = sparse.hstack(Xs).tocsr()
-        else:
-            Xs = np.hstack(Xs)
-        return Xs
+        return _hstack(list(Xs))
 
 
 def _check_key_type(key, superclass):
@@ -486,6 +477,19 @@ def _check_key_type(key, superclass):
     return False
 
 
+def _hstack(X):
+    """
+    Stacks X horizontally.
+
+    Supports input types (X): list of
+        numpy arrays, sparse arrays and DataFrames
+    """
+    if any(sparse.issparse(f) for f in X):
+        return sparse.hstack(X).tocsr()
+    else:
+        return np.hstack(X)
+
+
 def _get_column(X, key):
     """
     Get feature column(s) from input data X.
@@ -612,14 +616,17 @@ def make_column_transformer(*transformers, **kwargs):
     ----------
     *transformers : tuples of column selections and transformers
 
-    remainder : {'passthrough', 'drop'}, default 'passthrough'
+    remainder : {'passthrough', 'drop'} or estimator, default 'passthrough'
         By default, all remaining columns that were not specified in
         `transformers` will be automatically passed through (default of
         ``'passthrough'``). This subset of columns is concatenated with the
         output of the transformers.
         By using ``remainder='drop'``, only the specified columns in
         `transformers` are transformed and combined in the output, and the
         non-specified columns are dropped.
+        By setting ``remainder`` to be an estimator, the remaining
+        non-specified columns will use the ``remainder`` estimator. The
+        estimator must support `fit` and `transform`.
 
     n_jobs : int, optional
         Number of jobs to run in parallel (default 1).