diff --git a/pygal/config.py b/pygal/config.py index a89239c..3e4122a 100644 --- a/pygal/config.py +++ b/pygal/config.py @@ -344,7 +344,8 @@ class Config(CommonConfig): mode = Key( None, str, "Value", "Sets the mode to be used. " "(Currently only supported on box plot)", - "May be %s" % ' or '.join(["1.5IQR", "extremes"])) + "May be %s" % ' or '.join(["1.5IQR", "extremes", "tukey", "stdev",\ + "pstdev"])) order_min = Key( None, int, "Value", "Minimum order of scale, defaults to None") diff --git a/pygal/graph/box.py b/pygal/graph/box.py index fa29bdc..29e34b5 100644 --- a/pygal/graph/box.py +++ b/pygal/graph/box.py @@ -24,6 +24,7 @@ from __future__ import division from pygal.graph.graph import Graph from pygal.util import compute_scale, decorate from pygal._compat import is_list_like +from bisect import bisect_left, bisect_right class Box(Graph): @@ -48,9 +49,14 @@ class Box(Graph): def format_maybe_quartile(x): if is_list_like(x): if self.mode == "extremes": - return 'Min: %s Q1: %s Q2: %s Q3: %s Max: %s' % tuple(map(sup, x)) + return 'Min: %s Q1: %s Q2: %s Q3: %s Max: %s' \ + % tuple(map(sup, x[1:6])) + elif self.mode in ["tukey", "stdev", "pstdev"]: + return 'Min: %s Lower Whisker: %s Q1: %s Q2: %s Q3: %s '\ + 'Upper Whisker: %s Max: %s' % tuple(map(sup, x)) else: - return 'Q1: %s Q2: %s Q3: %s' % tuple(map(sup, x[1:4])) + # 1.5IQR mode + return 'Q1: %s Q2: %s Q3: %s' % tuple(map(sup, x[2:5])) else: return sup(x) return format_maybe_quartile @@ -61,7 +67,9 @@ class Box(Graph): within the rendering process """ for serie in self.series: - serie.values = self._box_points(serie.values, self.mode) + serie.values, serie.outliers = \ + self._box_points(serie.values, self.mode) + if self._min: self._box.ymin = min(self._min, self.zero) @@ -91,8 +99,8 @@ class Box(Graph): @property def _len(self): - """Len is always 5 here""" - return 5 + """Len is always 7 here""" + return 7 def _boxf(self, serie): """ @@ -112,11 +120,12 @@ class Box(Graph): metadata) val = self._format(serie.values) - x_center, y_center = self._draw_box(box, serie.values, serie.index) + x_center, y_center = self._draw_box(box, serie.values[1:6], + serie.outliers, serie.index) self._tooltip_data(box, val, x_center, y_center, classes="centered") self._static_value(serie_node, val, x_center, y_center) - def _draw_box(self, parent_node, quartiles, box_index): + def _draw_box(self, parent_node, quartiles, outliers, box_index): """ Return the center of a bounding box defined by a box plot. Draws a box plot on self.svg. @@ -164,6 +173,17 @@ class Box(Graph): width=width, class_='subtle-fill reactive tooltip-trigger') + # draw outliers + for o in outliers: + self.svg.node( + parent_node, + tag='circle', + cx=left_edge+width/2, + cy=self.view.y(o), + r=3, + class_='subtle-fill reactive tooltip-trigger') + + return (left_edge + width / 2, self.view.y( sum(quartiles) / len(quartiles))) @@ -171,11 +191,20 @@ class Box(Graph): def _box_points(values, mode='1.5IQR'): """ Default mode: (mode='1.5IQR' or unset) - Return a 5-tuple of Q1 - 1.5 * IQR, Q1, Median, Q3, - and Q3 + 1.5 * IQR for a list of numeric values. + Return a 7-tuple of min, Q1 - 1.5 * IQR, Q1, Median, Q3, + Q3 + 1.5 * IQR and max for a list of numeric values. Extremes mode: (mode='extremes') - Return a 5-tuple of minimum, Q1, Median, Q3, - and maximum for a list of numeric values. + Return a 7-tuple of 2x minimum, Q1, Median, Q3, + and 2x maximum for a list of numeric values. + Tukey mode: (mode='tukey') + Return a 7-tuple of min, q[0..4], max and a list of outliers + Outliers are considered values x: x < q1 - IQR or x > q3 + IQR + SD mode: (mode='stdev') + Return a 7-tuple of min, q[0..4], max and a list of outliers + Outliers are considered values x: x < q2 - SD or x > q2 + SD + SDp mode: (mode='pstdev') + Return a 7-tuple of min, q[0..4], max and a list of outliers + Outliers are considered values x: x < q2 - SDp or x > q2 + SDp The iterator values may include None values. @@ -191,11 +220,29 @@ class Box(Graph): else: # seq has an odd length return seq[n // 2] + def mean(seq): + return sum(seq) /len(seq) + + def stdev(seq): + m = mean(seq) + l = len(seq) + v = sum((n - m)**2 for n in seq) / (l - 1) # variance + return v**0.5 # sqrt + + def pstdev(seq): + m = mean(seq) + l = len(seq) + v = sum((n - m)**2 for n in seq) / l # variance + return v**0.5 # sqrt + + outliers = [] # sort the copy in case the originals must stay in original order s = sorted([x for x in values if x is not None]) n = len(s) if not n: - return 0, 0, 0, 0, 0 + return (0, 0, 0, 0, 0, 0, 0), [] + elif n == 1: + return (s[0], s[0], s[0], s[0], s[0], s[0], s[0]), [] else: q2 = median(s) # See 'Method 3' in http://en.wikipedia.org/wiki/Quartile @@ -209,17 +256,46 @@ class Box(Graph): elif n % 4 == 1: # n is of form 4n + 1 where n >= 1 m = (n - 1) // 4 q1 = 0.25 * s[m-1] + 0.75 * s[m] - q3 = 0.75 * s[3*m] + 0.25 * s[3*m + 1] + q3 = 0.75 * s[3*m] + 0.25 * s[3*m+1] else: # n is of form 4n + 3 where n >= 1 m = (n - 3) // 4 q1 = 0.75 * s[m] + 0.25 * s[m+1] q3 = 0.25 * s[3*m+1] + 0.75 * s[3*m+2] iqr = q3 - q1 + min_s = s[0] + max_s = s[-1] if mode == 'extremes': - q0 = min(s) - q4 = max(s) + q0 = min_s + q4 = max_s + elif mode == 'tukey': + # the lowest datum still within 1.5 IQR of the lower quartile, + # and the highest datum still within 1.5 IQR of the upper + # quartile [Tukey box plot, Wikipedia ] + b0 = bisect_left(s, q1 - 1.5 * iqr) + b4 = bisect_right(s, q3 + 1.5 * iqr) + q0 = s[b0] + q4 = s[b4-1] + outliers = s[:b0] + s[b4:] + elif mode == 'stdev': + # one standard deviation above and below the mean of the data + sd = stdev(s) + b0 = bisect_left(s, q2 - sd) + b4 = bisect_right(s, q2 + sd) + q0 = s[b0] + q4 = s[b4-1] + outliers = s[:b0] + s[b4:] + elif mode == 'pstdev': + # one population standard deviation above and below + # the mean of the data + sdp = pstdev(s) + b0 = bisect_left(s, q2 - sdp) + b4 = bisect_right(s, q2 + sdp) + q0 = s[b0] + q4 = s[b4-1] + outliers = s[:b0] + s[b4:] else: + # 1.5IQR mode q0 = q1 - 1.5 * iqr q4 = q3 + 1.5 * iqr - return q0, q1, q2, q3, q4 + return (min_s, q0, q1, q2, q3, q4, max_s), outliers diff --git a/pygal/test/test_box.py b/pygal/test/test_box.py index 492c3a3..2a277a8 100644 --- a/pygal/test/test_box.py +++ b/pygal/test/test_box.py @@ -22,7 +22,7 @@ from pygal import Box as ghostedBox def test_quartiles(): a = [-2.0, 3.0, 4.0, 5.0, 8.0] # odd test data - q0, q1, q2, q3, q4 = Box._box_points(a) + (min_s, q0, q1, q2, q3, q4, max_s), outliers = Box._box_points(a) assert q1 == 7.0 / 4.0 assert q2 == 4.0 @@ -31,17 +31,17 @@ def test_quartiles(): assert q4 == 23 / 4.0 + 6.0 # q3 + 1.5 * iqr b = [1.0, 4.0, 6.0, 8.0] # even test data - q0, q1, q2, q3, q4 = Box._box_points(b) + (min_s, q0, q1, q2, q3, q4, max_s), outliers = Box._box_points(b) assert q2 == 5.0 c = [2.0, None, 4.0, 6.0, None] # odd with None elements - q0, q1, q2, q3, q4 = Box._box_points(c) + (min_s, q0, q1, q2, q3, q4, max_s), outliers = Box._box_points(c) assert q2 == 4.0 d = [4] - q0, q1, q2, q3, q4 = Box._box_points(d) + (min_s, q0, q1, q2, q3, q4, max_s), outliers = Box._box_points(d) assert q0 == 4 assert q1 == 4 @@ -49,9 +49,11 @@ def test_quartiles(): assert q3 == 4 assert q4 == 4 + def test_quartiles_min_extremes(): a = [-2.0, 3.0, 4.0, 5.0, 8.0] # odd test data - q0, q1, q2, q3, q4 = Box._box_points(a, mode='extremes') + (min_s, q0, q1, q2, q3, q4, max_s), outliers = Box._box_points( + a, mode='extremes') assert q1 == 7.0 / 4.0 assert q2 == 4.0 @@ -60,17 +62,20 @@ def test_quartiles_min_extremes(): assert q4 == 8.0 # max b = [1.0, 4.0, 6.0, 8.0] # even test data - q0, q1, q2, q3, q4 = Box._box_points(b, mode='extremes') + (min_s, q0, q1, q2, q3, q4, max_s), outliers = Box._box_points( + b, mode='extremes') assert q2 == 5.0 c = [2.0, None, 4.0, 6.0, None] # odd with None elements - q0, q1, q2, q3, q4 = Box._box_points(c, mode='extremes') + (min_s, q0, q1, q2, q3, q4, max_s), outliers = Box._box_points( + c, mode='extremes') assert q2 == 4.0 d = [4] - q0, q1, q2, q3, q4 = Box._box_points(d, mode='extremes') + (min_s, q0, q1, q2, q3, q4, max_s), outliers = Box._box_points( + d, mode='extremes') assert q0 == 4 assert q1 == 4 @@ -79,6 +84,66 @@ def test_quartiles_min_extremes(): assert q4 == 4 +def test_quartiles_tukey(): + a = [] # empty data + (min_s, q0, q1, q2, q3, q4, max_s), outliers = Box._box_points( + a, mode='tukey') + assert min_s == q0 == q1 == q2 == q3 == q4 == 0 + assert outliers == [] + + # https://en.wikipedia.org/wiki/Quartile example 1 + b = [6, 7, 15, 36, 39, 40, 41, 42, 43, 47, 49] + (min_s, q0, q1, q2, q3, q4, max_s), outliers = Box._box_points( + b, mode='tukey') + assert min_s == q0 == 6 + assert q1 == 20.25 + assert q2 == 40 + assert q3 == 42.75 + assert max_s == q4 == 49 + assert outliers == [] + + # previous test with added outlier 75 + c = [6, 7, 15, 36, 39, 40, 41, 42, 43, 47, 49, 75] + (min_s, q0, q1, q2, q3, q4, max_s), outliers = Box._box_points( + c, mode='tukey') + assert min_s == q0 == 6 + assert q1 == 25.5 + assert q2 == (40 + 41) / 2.0 + assert q3 == 45 + assert max_s == 75 + assert outliers == [75] + + # one more outlier, 77 + c = [6, 7, 15, 36, 39, 40, 41, 42, 43, 47, 49, 75, 77] + (min_s, q0, q1, q2, q3, q4, max_s), outliers = Box._box_points( + c, mode='tukey') + assert min_s == q0 == 6 + assert q1 == 30.75 + assert q2 == 41 + assert q3 == 47.5 + assert max_s == 77 + assert 75 in outliers + assert 77 in outliers + +def test_quartiles_stdev(): + a = [35, 42, 35, 41, 36, 6, 12, 51, 33, 27, 46, 36, 44, 53, 75, 46, 16,\ + 51, 45, 29, 25, 26, 54, 61, 27, 40, 23, 34, 51, 37] + SD = 14.67 + (min_s, q0, q1, q2, q3, q4, max_s), outliers = Box._box_points( + a, mode='stdev') + assert min_s == min(a) + assert max_s == max(a) + assert q2 == 36.5 + assert q4 <= q2 + SD + assert q0 >= q2 - SD + assert all(n in outliers for n in [6, 12, 16, 53, 54, 61, 75]) + + b = [5] # test for posible zero division + (min_s, q0, q1, q2, q3, q4, max_s), outliers = Box._box_points( + b, mode='stdev') + assert min_s == q0 == q1 == q2 == q3 == q4 == max_s == b[0] + assert outliers == [] + def test_simple_box(): box = ghostedBox() box.add('test1', [-1, 2, 3, 3.1, 3.2, 4, 5])