Added statistics lines to the violinplot function.

solvents · solvents · commit b9d7e036a14f · 2014-05-24T16:47:57.000-04:00
diff --git a/lib/matplotlib/axes/_axes.py b/lib/matplotlib/axes/_axes.py
@@ -6725,7 +6725,8 @@ def matshow(self, Z, **kwargs):
                                                  integer=True))
         return im
 
-    def violinplot(self, dataset, positions=None, width=0.5):
+    def violinplot(self, dataset, positions=None, widths=0.5, showmeans=False,
+                   showextrema=True, showmedians=False):
         """
         Make a violin plot.
 
@@ -6748,11 +6749,20 @@ def violinplot(self, dataset, positions=None, width=0.5):
             Sets the positions of the violins. The ticks and limits are
             automatically set to match the positions.
 
-          width : array-like, default = 0.5
+          widths : array-like, default = 0.5
             Either a scalar or a vector that sets the maximal width of
             each violin. The default is 0.5, which uses about half of the
             available horizontal space.
 
+          showmeans : bool, default = False
+            If True, render a line at the mean of each violin.
+
+          showextrema : bool, default = True
+            If True, render the extrema (minimum and maximum) of each violin.
+
+          showmedians : bool, default = False
+            If True, render a line at the median of each violin.
+
         Returns
         -------
 
@@ -6763,24 +6773,58 @@ def violinplot(self, dataset, positions=None, width=0.5):
             - bodies: A list of the 
               :class:`matplotlib.collections.PolyCollection` instances
               containing the filled area of each violin.
-            - means: A list of the :class:`matplotlib.lines.Line2D` instances
-              created to identify the mean values for each of the violins.
-            - caps: A list of the :class:`matplotlib.lines.Line2D` instances
-              created to identify the extremal values of each violin's
-              data set.
+            - means: A :class:`matplotlib.collections.LineCollection` instance
+              created to identify the mean value of each violin's
+              distribution (``None`` if ``showmeans`` is false).
+            - mins: A :class:`matplotlib.collections.LineCollection` instance
+              created to identify the bottom of each violin's distribution.
+            - maxes: A :class:`matplotlib.collections.LineCollection` instance
+              created to identify the top of each violin's distribution.
+            - bars: A :class:`matplotlib.collections.LineCollection` instance
+              created to identify the centers of each violin's distribution.
+            - medians: A :class:`matplotlib.collections.LineCollection` instance
+              created to identify the median value of each violin's
+              distribution (``None`` if ``showmedians`` is false).
 
         """
 
-        bodies = []
+        # Statistical quantities to be plotted on the violins
         means = []
-        caps = []
+        mins = []
+        maxes = []
+        medians = []
+
+        # Collections to be returned
+        bodies = []
+        cmeans = None
+        cmaxes = None
+        cmins = None
+        cbars = None
+        cmedians = None
 
+        # Validate positions
         if positions == None:
             positions = range(1, len(dataset) + 1)
         elif len(positions) != len(dataset):
             raise ValueError(datashape_message.format("positions"))
 
-        for d,p in zip(dataset,positions):            
+        # Validate widths
+        if np.isscalar(widths):
+            widths = [widths] * len(dataset)
+        elif len(widths) != len(dataset):
+            raise ValueError(datashape_message.format("widths"))
+
+        # Calculate mins and maxes for statistics lines
+        pmins = -0.25 * np.array(widths) + positions
+        pmaxes = 0.25 * np.array(widths) + positions
+
+        # Check hold status
+        if not self._hold:
+            self.cla()
+        holdStatus = self._hold
+
+        # Render violins
+        for d,p,w in zip(dataset,positions,widths):            
             # Calculate the kernel density
             kde = mlab.ksdensity(d)
             m = kde['xmin']
@@ -6793,18 +6837,43 @@ def violinplot(self, dataset, positions=None, width=0.5):
             # Since each data point p is plotted from v-p to v+p,
             # we need to scale it by an additional 0.5 factor so that we get
             # correct width in the end.
-            v = 0.5 * width * v/v.max()
+            v = 0.5 * w * v/v.max()
 
             bodies += [self.fill_betweenx(coords,
                                           -v+p,
                                           v+p,
                                           facecolor='y',
                                           alpha=0.3)]
 
+            means.append(mean)
+            mins.append(m)
+            maxes.append(M)
+            medians.append(median)
+
+        # Render means
+        if showmeans:
+            cmeans = self.hlines(means, pmins, pmaxes, colors='r')
+
+        # Render extrema
+        if showextrema:
+            cmaxes = self.hlines(maxes, pmins, pmaxes, colors='r')
+            cmins = self.hlines(mins, pmins, pmaxes, colors='r')
+            cbars = self.vlines(positions, mins, maxes, colors='r')
+
+        # Render medians
+        if showmedians:
+            cmedians = self.hlines(medians, pmins, pmaxes, colors='r')
+
+        # Reset hold
+        self.hold(holdStatus)
+
         return {
             'bodies' : bodies,
-            'means' : means,
-            'caps' : caps
+            'means' : cmeans,
+            'mins' : cmins,
+            'maxes' : cmaxes,
+            'bars' : cbars,
+            'medians' : cmedians
         }
 
 
diff --git a/lib/matplotlib/mlab.py b/lib/matplotlib/mlab.py
@@ -3661,7 +3661,7 @@ def ksdensity(dataset, bw_method=None):
     Representation of a kernel-density estimate using Gaussian kernels.
 
     Call signature::
-    xmin, xmax, result = ksdensity(dataset, 'scott')
+    kde_dict = ksdensity(dataset, 'silverman')
 
     Parameters
     ----------
@@ -3714,22 +3714,22 @@ def ksdensity(dataset, bw_method=None):
     """
 
     # This implementation with minor modification was too good to pass up.
-    # from scipy: https://github.com/scipy/scipy/blob/master/scipy/stats/kde.py 
+    # from scipy: https://github.com/scipy/scipy/blob/master/scipy/stats/kde.py
 
-    dataset = np.atleast_2d(dataset)
+    dataset = np.array(np.atleast_2d(dataset))
     xmin = dataset.min()
     xmax = dataset.max()
 
     if not dataset.size > 1:
         raise ValueError("`dataset` input should have multiple elements.")
 
-    d, n = dataset.shape
+    dim, num_dp = dataset.shape
 
     # ----------------------------------------------
     # Set Bandwidth, defaulted to Scott's Factor
     # ----------------------------------------------
-    scotts_factor = lambda: np.power(n, -1./(d+4))
-    silverman_factor = lambda: np.power(n*(d+2.0)/4.0, -1./(d+4))
+    scotts_factor = lambda: np.power(num_dp, -1./(dim+4))
+    silverman_factor = lambda: np.power(num_dp*(dim+2.0)/4.0, -1./(dim+4))
 
     # Default method to calculate bandwidth, can be overwritten by subclass
     covariance_factor = scotts_factor
@@ -3740,7 +3740,7 @@ def ksdensity(dataset, bw_method=None):
         covariance_factor = scotts_factor
     elif bw_method == 'silverman':
         covariance_factor = silverman_factor
-    elif np.isscalar(bw_method) and not isinstance(bw_method, string_types):
+    elif np.isscalar(bw_method) and not isinstance(bw_method, six.string_types):
         covariance_factor = lambda: bw_method
     else:
         msg = "`bw_method` should be 'scott', 'silverman', or a scalar"
@@ -3752,53 +3752,54 @@ def ksdensity(dataset, bw_method=None):
     factor = covariance_factor()
 
     # Cache covariance and inverse covariance of the data
-    data_covariance = np.atleast_2d(np.cov(dataset, rowvar=1,bias=False))
+    data_covariance = np.atleast_2d(np.cov(dataset, rowvar=1, bias=False))
     data_inv_cov = np.linalg.inv(data_covariance)
 
     covariance = data_covariance * factor**2
     inv_cov = data_inv_cov / factor**2
-    norm_factor = np.sqrt(np.linalg.det(2*np.pi*covariance)) * n
+    norm_factor = np.sqrt(np.linalg.det(2*np.pi*covariance)) * num_dp
 
     # ----------------------------------------------
     # Evaluate the estimated pdf on a set of points.
     # ----------------------------------------------
-    points = np.atleast_2d(np.arange(xmin,xmax, (xmax-xmin)/100.))
+    points = np.atleast_2d(np.arange(xmin, xmax, (xmax-xmin)/100.))
 
-    d1, m1 = points.shape
-    if d1 != d:
-        if d1 == 1 and m1 == d:
+    dim_pts, num_dp_pts = np.array(points).shape
+    if dim_pts != dim:
+        if dim_pts == 1 and num_dp_pts == num_dp:
             # points was passed in as a row vector
-            points = np.reshape(points, (d, 1))
-            m1 = 1
+            points = np.reshape(points, (dim, 1))
+            num_dp_pts = 1
         else:
-            msg = "points have dimension %s, dataset has dimension %s" % (d1, d)
+            msg = "points have dimension %s,\
+                   dataset has dimension %s" % (dim_pts, dim)
             raise ValueError(msg)
 
-    result = np.zeros((m1,), dtype=np.float)
+    result = np.zeros((num_dp_pts,), dtype=np.float)
 
-    if m1 >= n:
+    if num_dp_pts >= num_dp:
         # there are more points than data, so loop over data
-        for i in range(n):
+        for i in range(num_dp):
             diff = dataset[:, i, np.newaxis] - points
             tdiff = np.dot(inv_cov, diff)
-            energy = np.sum(diff*tdiff,axis=0) / 2.0
+            energy = np.sum(diff*tdiff, axis=0) / 2.0
             result = result + np.exp(-energy)
     else:
         # loop over points
-        for i in range(m):
-            diff = dataset - points[:, i, newaxis]
+        for i in range(num_dp_pts):
+            diff = dataset - points[:, i, np.newaxis]
             tdiff = np.dot(inv_cov, diff)
             energy = np.sum(diff * tdiff, axis=0) / 2.0
             result[i] = np.sum(np.exp(-energy), axis=0)
 
     result = result / norm_factor
 
     return {
-        'xmin' : xmin,
-        'xmax' : xmax,
-        'mean' : np.mean(result),
-        'median' : np.median(result),
-        'result' : result
+        'xmin': xmin,
+        'xmax': xmax,
+        'mean': np.mean(dataset),
+        'median': np.median(dataset),
+        'result': result
     }
 
 ##################################################