From 8e7efa53bd34f09aef25de07aeb11507c0b16971 Mon Sep 17 00:00:00 2001 From: Jan Richter Date: Sat, 13 Jun 2015 11:23:39 +0200 Subject: [PATCH] added 'tukey' Box plot mode and outliers plotting Tukey box plot (introduced by John W. Tukey) takes for whiskers the lowest datum still within 1.5 IQR of the lower quartile, and the highest datum still within 1.5 IQR of the upper quartile. (https://en.wikipedia.org/wiki/Box_plot) --- pygal/config.py | 2 +- pygal/graph/box.py | 66 +++++++++++++++++++++++++++++++++++----------- 2 files changed, 52 insertions(+), 16 deletions(-) diff --git a/pygal/config.py b/pygal/config.py index a89239c..733afa6 100644 --- a/pygal/config.py +++ b/pygal/config.py @@ -344,7 +344,7 @@ class Config(CommonConfig): mode = Key( None, str, "Value", "Sets the mode to be used. " "(Currently only supported on box plot)", - "May be %s" % ' or '.join(["1.5IQR", "extremes"])) + "May be %s" % ' or '.join(["1.5IQR", "extremes", "tukey"])) order_min = Key( None, int, "Value", "Minimum order of scale, defaults to None") diff --git a/pygal/graph/box.py b/pygal/graph/box.py index fa29bdc..1c47834 100644 --- a/pygal/graph/box.py +++ b/pygal/graph/box.py @@ -24,6 +24,7 @@ from __future__ import division from pygal.graph.graph import Graph from pygal.util import compute_scale, decorate from pygal._compat import is_list_like +from bisect import bisect_left, bisect_right class Box(Graph): @@ -48,9 +49,13 @@ class Box(Graph): def format_maybe_quartile(x): if is_list_like(x): if self.mode == "extremes": - return 'Min: %s Q1: %s Q2: %s Q3: %s Max: %s' % tuple(map(sup, x)) + return 'Min: %s Q1: %s Q2: %s Q3: %s Max: %s' \ + % tuple(map(sup, x[1:6])) + elif self.mode == "tukey": + return 'Min: %s Lower Whisker: %s Q1: %s Q2: %s Q3: %s '\ + 'Upper Whisker: %s Max: %s' % tuple(map(sup, x)) else: - return 'Q1: %s Q2: %s Q3: %s' % tuple(map(sup, x[1:4])) + return 'Q1: %s Q2: %s Q3: %s' % tuple(map(sup, x[2:5])) else: return sup(x) return format_maybe_quartile @@ -61,7 +66,9 @@ class Box(Graph): within the rendering process """ for serie in self.series: - serie.values = self._box_points(serie.values, self.mode) + serie.values, serie.outliers = \ + self._box_points(serie.values, self.mode) + if self._min: self._box.ymin = min(self._min, self.zero) @@ -92,7 +99,7 @@ class Box(Graph): @property def _len(self): """Len is always 5 here""" - return 5 + return 7 def _boxf(self, serie): """ @@ -112,11 +119,12 @@ class Box(Graph): metadata) val = self._format(serie.values) - x_center, y_center = self._draw_box(box, serie.values, serie.index) + x_center, y_center = self._draw_box(box, serie.values[1:6], + serie.outliers, serie.index) self._tooltip_data(box, val, x_center, y_center, classes="centered") self._static_value(serie_node, val, x_center, y_center) - def _draw_box(self, parent_node, quartiles, box_index): + def _draw_box(self, parent_node, quartiles, outliers, box_index): """ Return the center of a bounding box defined by a box plot. Draws a box plot on self.svg. @@ -164,6 +172,17 @@ class Box(Graph): width=width, class_='subtle-fill reactive tooltip-trigger') + # draw outliers + for o in outliers: + self.svg.node( + parent_node, + tag='circle', + cx=left_edge+width/2, + cy=self.view.y(o), + r=3, + class_='subtle-fill reactive tooltip-trigger') + + return (left_edge + width / 2, self.view.y( sum(quartiles) / len(quartiles))) @@ -171,11 +190,13 @@ class Box(Graph): def _box_points(values, mode='1.5IQR'): """ Default mode: (mode='1.5IQR' or unset) - Return a 5-tuple of Q1 - 1.5 * IQR, Q1, Median, Q3, - and Q3 + 1.5 * IQR for a list of numeric values. + Return a 7-tuple of min, Q1 - 1.5 * IQR, Q1, Median, Q3, + Q3 + 1.5 * IQR and max for a list of numeric values. Extremes mode: (mode='extremes') - Return a 5-tuple of minimum, Q1, Median, Q3, - and maximum for a list of numeric values. + Return a 7-tuple of 2x minimum, Q1, Median, Q3, + and 2x maximum for a list of numeric values. + Outliers (Tukey) mode: (mode='tukey') + Return a 7-tuple of min, q[0..4], max and a list of outliers The iterator values may include None values. @@ -192,10 +213,11 @@ class Box(Graph): return seq[n // 2] # sort the copy in case the originals must stay in original order + outliers = [] s = sorted([x for x in values if x is not None]) n = len(s) if not n: - return 0, 0, 0, 0, 0 + return (0, 0, 0, 0, 0, 0, 0), [] else: q2 = median(s) # See 'Method 3' in http://en.wikipedia.org/wiki/Quartile @@ -209,17 +231,31 @@ class Box(Graph): elif n % 4 == 1: # n is of form 4n + 1 where n >= 1 m = (n - 1) // 4 q1 = 0.25 * s[m-1] + 0.75 * s[m] - q3 = 0.75 * s[3*m] + 0.25 * s[3*m + 1] + q3 = 0.75 * s[3*m] + 0.25 * s[3*m+1] else: # n is of form 4n + 3 where n >= 1 m = (n - 3) // 4 q1 = 0.75 * s[m] + 0.25 * s[m+1] q3 = 0.25 * s[3*m+1] + 0.75 * s[3*m+2] iqr = q3 - q1 + min_s = s[0] + max_s = s[-1] if mode == 'extremes': - q0 = min(s) - q4 = max(s) + q0 = min_s + q4 = max_s + elif mode == 'tukey': + # the lowest datum still within 1.5 IQR of the lower quartile, + # and the highest datum still within 1.5 IQR of the upper + # quartile [Tukey box plot, Wikipedia ] + b0 = bisect_left(s, q1 - 1.5 * iqr) + b4 = bisect_right(s, q3 + 1.5 * iqr) + q0 = s[b0] + q4 = s[b4-1] + outliers = s[:b0] + s[b4:] + #print "Q: [%s,%s,%s,%s,%s] O: %s" \ + # % (q0, q1, q2, q3, q4, outliers) + else: q0 = q1 - 1.5 * iqr q4 = q3 + 1.5 * iqr - return q0, q1, q2, q3, q4 + return (min_s, q0, q1, q2, q3, q4, max_s), outliers