Implemented median and constant strategies in DummyRegressor

maheshakya · maheshakya · commit d41fb761de52 · 2014-03-10T18:14:53.000+05:30
diff --git a/sklearn/dummy.py b/sklearn/dummy.py
@@ -1,6 +1,6 @@
-
 # Author: Mathieu Blondel <mathieu@mblondel.org>
 #         Arnaud Joly <a.joly@ulg.ac.be>
+#         Maheshakya Wijewardena<maheshakya.10@cse.mrt.ac.lk>
 # License: BSD 3 clause
 
 import numpy as np
@@ -13,6 +13,7 @@
 
 
 class DummyClassifier(BaseEstimator, ClassifierMixin):
+
     """
     DummyClassifier is a classifier that makes predictions using simple rules.
 
@@ -273,6 +274,7 @@ def predict_log_proba(self, X):
 
 
 class DummyRegressor(BaseEstimator, RegressorMixin):
+
     """
     DummyRegressor is a regressor that always predicts the mean of the training
     targets.
@@ -282,8 +284,9 @@ class DummyRegressor(BaseEstimator, RegressorMixin):
 
     Attributes
     ----------
-    `y_mean_` : float or array of shape [n_outputs]
-        Mean of the training targets.
+    `constant_` : float or array of shape [n_outputs]
+        Mean or median of the training targets or constant value given by
+        the user.
 
     `n_outputs_` : int,
         Number of outputs.
@@ -292,6 +295,10 @@ class DummyRegressor(BaseEstimator, RegressorMixin):
         True if the output at fit is 2d, else false.
     """
 
+    def __init__(self, strategy="mean", constant=None):
+        self.strategy = strategy  # "mean", "median" or "constant"
+        self.constant = constant  # must be set when strategy="constant"
+
     def fit(self, X, y):
         """Fit the random regressor.
 
@@ -309,11 +316,47 @@ def fit(self, X, y):
         self : object
             Returns self.
         """
+        # Reject unknown strategies before looking at the data.
+        if self.strategy not in ("mean", "median", "constant"):
+            raise ValueError("Unknown strategy type.")
+
         y = safe_asarray(y)
-        self.y_mean_ = np.reshape(np.mean(y, axis=0), (1, -1))
-        self.n_outputs_ = np.size(self.y_mean_)  # y.shape[1] is not safe
-        self.output_2d_ = (y.ndim == 2)
-        return self
+        # predict() uses output_2d_ to decide whether to ravel its
+        # result back to 1d.
+        self.output_2d_ = (y.ndim == 2)
+
+        if self.strategy == "mean":
+            self.constant_ = np.reshape(np.mean(y, axis=0), (1, -1))
+
+        elif self.strategy == "median":
+            self.constant_ = np.reshape(np.median(y, axis=0), (1, -1))
+
+        else:  # self.strategy == "constant"
+            if self.constant is None:
+                raise ValueError("Constant not defined.")
+
+            # Work on a local array: fit must not overwrite the
+            # ``constant`` constructor parameter, so cloning the estimator
+            # later still sees the value the user passed. Reshaping to a
+            # single row first also accepts scalars and 1d sequences,
+            # which have no ``shape[1]`` to compare against.
+            constant = np.reshape(np.asarray(self.constant), (1, -1))
+
+            if self.output_2d_:
+                if constant.shape[1] != y.shape[1]:
+                    raise ValueError(
+                        "Number of outputs and number of constants do "
+                        "not match.")
+            elif constant.shape[1] != 1:
+                raise ValueError(
+                    "Number of constants should be equal to one.")
+
+            self.constant_ = constant
+
+        # Every branch above stored a single row in constant_, so its
+        # size is the number of outputs.
+        self.n_outputs_ = np.size(self.constant_)  # y.shape[1] is not safe
+        return self
 
     def predict(self, X):
         """
@@ -330,14 +373,16 @@ def predict(self, X):
         y : array, shape = [n_samples]  or [n_samples, n_outputs]
             Predicted target values for X.
         """
-        if not hasattr(self, "y_mean_"):
+        if not hasattr(self, "constant_"):
             raise ValueError("DummyRegressor not fitted.")
 
         X = safe_asarray(X)
         n_samples = X.shape[0]
-        y = np.ones((n_samples, 1)) * self.y_mean_
+
+        y = np.ones((n_samples, 1)) * self.constant_
 
         if self.n_outputs_ == 1 and not self.output_2d_:
             y = np.ravel(y)
 
         return y
+
diff --git a/sklearn/tests/test_dummy.py b/sklearn/tests/test_dummy.py
@@ -59,6 +64,19 @@ def _check_behavior_2d(clf):
     assert_equal(y.shape, y_pred.shape)
 
 
+def _check_behavior_2d_for_constant(clf):
+    # Fit a clone on a five-output target; the features are ignored.
+    features = np.array([[0], [0], [0], [0]])
+    targets = np.array([[1, 0, 5, 4, 3],
+                        [2, 0, 1, 2, 5],
+                        [1, 0, 4, 5, 2],
+                        [1, 3, 3, 2, 0]])
+    model = clone(clf)
+    model.fit(features, targets)
+    # Predictions must keep the 2d shape of the training targets.
+    assert_equal(targets.shape, model.predict(features).shape)
+
+
 def test_most_frequent_strategy():
     X = [[0], [0], [0], [0]]  # ignored
     y = [1, 2, 1, 1]
@@ -175,7 +193,7 @@ def test_classifier_exceptions():
     assert_raises(ValueError, clf.predict_proba, [])
 
 
-def test_regressor():
+def test_mean_strategy_regressor():
     X = [[0]] * 4  # ignored
     y = [1, 2, 1, 1]
 
@@ -184,7 +202,7 @@ def test_regressor():
     assert_array_equal(reg.predict(X), [5. / 4] * len(X))
 
 
-def test_multioutput_regressor():
+def test_mean_strategy_multioutput_regressor():
 
     X_learn = np.random.randn(10, 10)
     y_learn = np.random.randn(10, 5)
@@ -210,6 +228,66 @@ def test_regressor_exceptions():
     assert_raises(ValueError, reg.predict, [])
 
 
+def test_median_strategy_regressor():
+    # Odd number of targets, so the median is the middle value, 4.
+    targets = [1, 2, 4, 6, 8]
+    samples = [[0]] * len(targets)  # feature values are ignored
+    model = DummyRegressor(strategy="median")
+    model.fit(samples, targets)
+    assert_array_equal(model.predict(samples), [4] * len(samples))
+
+
+def test_median_strategy_multioutput_regressor():
+    # Random five-output problem; the feature values are irrelevant.
+    n_train, n_test, n_features, n_outputs = 10, 20, 10, 5
+    X_learn = np.random.randn(n_train, n_features)
+    y_learn = np.random.randn(n_train, n_outputs)
+    X_test = np.random.randn(n_test, n_features)
+    y_test = np.random.randn(n_test, n_outputs)
+
+    # Expected prediction: the per-output median as a single row.
+    expected_row = np.median(y_learn, axis=0).reshape((1, -1))
+
+    model = DummyRegressor(strategy="median")
+    model.fit(X_learn, y_learn)
+
+    # The same row must be repeated for every sample, seen or unseen.
+    for X, y in ((X_learn, y_learn), (X_test, y_test)):
+        assert_array_equal(np.tile(expected_row, (y.shape[0], 1)),
+                           model.predict(X))
+    _check_behavior_2d(model)
+
+
+def test_constant_strategy_regressor():
+    # Whatever the targets are, the user-supplied constant is predicted.
+    targets = [1, 2, 4, 6, 8]
+    samples = [[0]] * len(targets)  # feature values are ignored
+    model = DummyRegressor(strategy="constant", constant=[43])
+    model.fit(samples, targets)
+    assert_array_equal(model.predict(samples), [43] * len(samples))
+
+
+def test_constant_strategy_multioutput_regressor():
+    # Random five-output problem; the feature values are irrelevant.
+    n_train, n_test, n_features, n_outputs = 10, 20, 10, 5
+    X_learn = np.random.randn(n_train, n_features)
+    y_learn = np.random.randn(n_train, n_outputs)
+    # One user-chosen constant per output, given as a single row.
+    constants = np.random.randn(1, n_outputs)
+    X_test = np.random.randn(n_test, n_features)
+    y_test = np.random.randn(n_test, n_outputs)
+
+    model = DummyRegressor(strategy="constant", constant=constants)
+    model.fit(X_learn, y_learn)
+
+    # The constants row must be repeated for every sample.
+    for X, y in ((X_learn, y_learn), (X_test, y_test)):
+        assert_array_equal(np.tile(constants, (y.shape[0], 1)),
+                           model.predict(X))
+
+    _check_behavior_2d_for_constant(model)
+
+
 def test_constant_strategy():
     X = [[0], [0], [0], [0]]  # ignored
     y = [2, 1, 2, 2]
@@ -253,3 +331,4 @@ def test_constant_strategy_exceptions():
     clf = DummyClassifier(strategy="constant", random_state=0,
                           constant=[2, 0])
     assert_raises(ValueError, clf.fit, X, y)
+