From 8e7efa53bd34f09aef25de07aeb11507c0b16971 Mon Sep 17 00:00:00 2001 From: Jan Richter Date: Sat, 13 Jun 2015 11:23:39 +0200 Subject: [PATCH 1/5] added 'tukey' Box plot mode and outliers plotting Tukey box plot (introduced by John W. Tukey) takes for whiskers the lowest datum still within 1.5 IQR of the lower quartile, and the highest datum still within 1.5 IQR of the upper quartile. (https://en.wikipedia.org/wiki/Box_plot) --- pygal/config.py | 2 +- pygal/graph/box.py | 66 +++++++++++++++++++++++++++++++++++----------- 2 files changed, 52 insertions(+), 16 deletions(-) diff --git a/pygal/config.py b/pygal/config.py index a89239c..733afa6 100644 --- a/pygal/config.py +++ b/pygal/config.py @@ -344,7 +344,7 @@ class Config(CommonConfig): mode = Key( None, str, "Value", "Sets the mode to be used. " "(Currently only supported on box plot)", - "May be %s" % ' or '.join(["1.5IQR", "extremes"])) + "May be %s" % ' or '.join(["1.5IQR", "extremes", "tukey"])) order_min = Key( None, int, "Value", "Minimum order of scale, defaults to None") diff --git a/pygal/graph/box.py b/pygal/graph/box.py index fa29bdc..1c47834 100644 --- a/pygal/graph/box.py +++ b/pygal/graph/box.py @@ -24,6 +24,7 @@ from __future__ import division from pygal.graph.graph import Graph from pygal.util import compute_scale, decorate from pygal._compat import is_list_like +from bisect import bisect_left, bisect_right class Box(Graph): @@ -48,9 +49,13 @@ class Box(Graph): def format_maybe_quartile(x): if is_list_like(x): if self.mode == "extremes": - return 'Min: %s Q1: %s Q2: %s Q3: %s Max: %s' % tuple(map(sup, x)) + return 'Min: %s Q1: %s Q2: %s Q3: %s Max: %s' \ + % tuple(map(sup, x[1:6])) + elif self.mode == "tukey": + return 'Min: %s Lower Whisker: %s Q1: %s Q2: %s Q3: %s '\ + 'Upper Whisker: %s Max: %s' % tuple(map(sup, x)) else: - return 'Q1: %s Q2: %s Q3: %s' % tuple(map(sup, x[1:4])) + return 'Q1: %s Q2: %s Q3: %s' % tuple(map(sup, x[2:5])) else: return sup(x) return format_maybe_quartile @@ -61,7 
+66,9 @@ class Box(Graph): within the rendering process """ for serie in self.series: - serie.values = self._box_points(serie.values, self.mode) + serie.values, serie.outliers = \ + self._box_points(serie.values, self.mode) + if self._min: self._box.ymin = min(self._min, self.zero) @@ -92,7 +99,7 @@ class Box(Graph): @property def _len(self): """Len is always 5 here""" - return 5 + return 7 def _boxf(self, serie): """ @@ -112,11 +119,12 @@ class Box(Graph): metadata) val = self._format(serie.values) - x_center, y_center = self._draw_box(box, serie.values, serie.index) + x_center, y_center = self._draw_box(box, serie.values[1:6], + serie.outliers, serie.index) self._tooltip_data(box, val, x_center, y_center, classes="centered") self._static_value(serie_node, val, x_center, y_center) - def _draw_box(self, parent_node, quartiles, box_index): + def _draw_box(self, parent_node, quartiles, outliers, box_index): """ Return the center of a bounding box defined by a box plot. Draws a box plot on self.svg. @@ -164,6 +172,17 @@ class Box(Graph): width=width, class_='subtle-fill reactive tooltip-trigger') + # draw outliers + for o in outliers: + self.svg.node( + parent_node, + tag='circle', + cx=left_edge+width/2, + cy=self.view.y(o), + r=3, + class_='subtle-fill reactive tooltip-trigger') + + return (left_edge + width / 2, self.view.y( sum(quartiles) / len(quartiles))) @@ -171,11 +190,13 @@ class Box(Graph): def _box_points(values, mode='1.5IQR'): """ Default mode: (mode='1.5IQR' or unset) - Return a 5-tuple of Q1 - 1.5 * IQR, Q1, Median, Q3, - and Q3 + 1.5 * IQR for a list of numeric values. + Return a 7-tuple of min, Q1 - 1.5 * IQR, Q1, Median, Q3, + Q3 + 1.5 * IQR and max for a list of numeric values. Extremes mode: (mode='extremes') - Return a 5-tuple of minimum, Q1, Median, Q3, - and maximum for a list of numeric values. + Return a 7-tuple of 2x minimum, Q1, Median, Q3, + and 2x maximum for a list of numeric values. 
+ Outliers (Tukey) mode: (mode='tukey') + Return a 7-tuple of min, q[0..4], max and a list of outliers The iterator values may include None values. @@ -192,10 +213,11 @@ class Box(Graph): return seq[n // 2] # sort the copy in case the originals must stay in original order + outliers = [] s = sorted([x for x in values if x is not None]) n = len(s) if not n: - return 0, 0, 0, 0, 0 + return (0, 0, 0, 0, 0, 0, 0), [] else: q2 = median(s) # See 'Method 3' in http://en.wikipedia.org/wiki/Quartile @@ -209,17 +231,31 @@ class Box(Graph): elif n % 4 == 1: # n is of form 4n + 1 where n >= 1 m = (n - 1) // 4 q1 = 0.25 * s[m-1] + 0.75 * s[m] - q3 = 0.75 * s[3*m] + 0.25 * s[3*m + 1] + q3 = 0.75 * s[3*m] + 0.25 * s[3*m+1] else: # n is of form 4n + 3 where n >= 1 m = (n - 3) // 4 q1 = 0.75 * s[m] + 0.25 * s[m+1] q3 = 0.25 * s[3*m+1] + 0.75 * s[3*m+2] iqr = q3 - q1 + min_s = s[0] + max_s = s[-1] if mode == 'extremes': - q0 = min(s) - q4 = max(s) + q0 = min_s + q4 = max_s + elif mode == 'tukey': + # the lowest datum still within 1.5 IQR of the lower quartile, + # and the highest datum still within 1.5 IQR of the upper + # quartile [Tukey box plot, Wikipedia ] + b0 = bisect_left(s, q1 - 1.5 * iqr) + b4 = bisect_right(s, q3 + 1.5 * iqr) + q0 = s[b0] + q4 = s[b4-1] + outliers = s[:b0] + s[b4:] + #print "Q: [%s,%s,%s,%s,%s] O: %s" \ + # % (q0, q1, q2, q3, q4, outliers) + else: q0 = q1 - 1.5 * iqr q4 = q3 + 1.5 * iqr - return q0, q1, q2, q3, q4 + return (min_s, q0, q1, q2, q3, q4, max_s), outliers From 81e50cd8340042a38e83917adaf11b1710b63fb5 Mon Sep 17 00:00:00 2001 From: Jan Richter Date: Sun, 14 Jun 2015 12:17:59 +0200 Subject: [PATCH 2/5] tukey box plot tests box plot tests updated according to the new return format of _box_points() method --- pygal/graph/box.py | 4 +-- pygal/test/test_box.py | 63 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 56 insertions(+), 11 deletions(-) diff --git a/pygal/graph/box.py b/pygal/graph/box.py index 1c47834..e662675 100644 --- 
a/pygal/graph/box.py +++ b/pygal/graph/box.py @@ -212,8 +212,8 @@ class Box(Graph): else: # seq has an odd length return seq[n // 2] - # sort the copy in case the originals must stay in original order outliers = [] + # sort the copy in case the originals must stay in original order s = sorted([x for x in values if x is not None]) n = len(s) if not n: @@ -252,8 +252,6 @@ class Box(Graph): q0 = s[b0] q4 = s[b4-1] outliers = s[:b0] + s[b4:] - #print "Q: [%s,%s,%s,%s,%s] O: %s" \ - # % (q0, q1, q2, q3, q4, outliers) else: q0 = q1 - 1.5 * iqr diff --git a/pygal/test/test_box.py b/pygal/test/test_box.py index 492c3a3..95105d6 100644 --- a/pygal/test/test_box.py +++ b/pygal/test/test_box.py @@ -22,7 +22,7 @@ from pygal import Box as ghostedBox def test_quartiles(): a = [-2.0, 3.0, 4.0, 5.0, 8.0] # odd test data - q0, q1, q2, q3, q4 = Box._box_points(a) + (min_s, q0, q1, q2, q3, q4, max_s), outliers = Box._box_points(a) assert q1 == 7.0 / 4.0 assert q2 == 4.0 @@ -31,17 +31,17 @@ def test_quartiles(): assert q4 == 23 / 4.0 + 6.0 # q3 + 1.5 * iqr b = [1.0, 4.0, 6.0, 8.0] # even test data - q0, q1, q2, q3, q4 = Box._box_points(b) + (min_s, q0, q1, q2, q3, q4, max_s), outliers = Box._box_points(b) assert q2 == 5.0 c = [2.0, None, 4.0, 6.0, None] # odd with None elements - q0, q1, q2, q3, q4 = Box._box_points(c) + (min_s, q0, q1, q2, q3, q4, max_s), outliers = Box._box_points(c) assert q2 == 4.0 d = [4] - q0, q1, q2, q3, q4 = Box._box_points(d) + (min_s, q0, q1, q2, q3, q4, max_s), outliers = Box._box_points(d) assert q0 == 4 assert q1 == 4 @@ -49,9 +49,11 @@ def test_quartiles(): assert q3 == 4 assert q4 == 4 + def test_quartiles_min_extremes(): a = [-2.0, 3.0, 4.0, 5.0, 8.0] # odd test data - q0, q1, q2, q3, q4 = Box._box_points(a, mode='extremes') + (min_s, q0, q1, q2, q3, q4, max_s), outliers = Box._box_points( + a, mode='extremes') assert q1 == 7.0 / 4.0 assert q2 == 4.0 @@ -60,17 +62,20 @@ def test_quartiles_min_extremes(): assert q4 == 8.0 # max b = [1.0, 4.0, 6.0, 8.0] 
# even test data - q0, q1, q2, q3, q4 = Box._box_points(b, mode='extremes') + (min_s, q0, q1, q2, q3, q4, max_s), outliers = Box._box_points( + b, mode='extremes') assert q2 == 5.0 c = [2.0, None, 4.0, 6.0, None] # odd with None elements - q0, q1, q2, q3, q4 = Box._box_points(c, mode='extremes') + (min_s, q0, q1, q2, q3, q4, max_s), outliers = Box._box_points( + c, mode='extremes') assert q2 == 4.0 d = [4] - q0, q1, q2, q3, q4 = Box._box_points(d, mode='extremes') + (min_s, q0, q1, q2, q3, q4, max_s), outliers = Box._box_points( + d, mode='extremes') assert q0 == 4 assert q1 == 4 @@ -79,6 +84,48 @@ def test_quartiles_min_extremes(): assert q4 == 4 +def test_quartiles_tukey(): + a = [] # empty data + (min_s, q0, q1, q2, q3, q4, max_s), outliers = Box._box_points( + a, mode='tukey') + assert min_s == q0 == q1 == q2 == q3 == q4 == 0 + assert outliers == [] + + # https://en.wikipedia.org/wiki/Quartile example 1 + b = [6, 7, 15, 36, 39, 40, 41, 42, 43, 47, 49] + (min_s, q0, q1, q2, q3, q4, max_s), outliers = Box._box_points( + b, mode='tukey') + assert min_s == q0 == 6 + assert q1 == 20.25 + assert q2 == 40 + assert q3 == 42.75 + assert max_s == q4 == 49 + assert outliers == [] + + # previous test with added outlier 75 + c = [6, 7, 15, 36, 39, 40, 41, 42, 43, 47, 49, 75] + (min_s, q0, q1, q2, q3, q4, max_s), outliers = Box._box_points( + c, mode='tukey') + assert min_s == q0 == 6 + assert q1 == 25.5 + assert q2 == (40 + 41) / 2.0 + assert q3 == 45 + assert max_s == 75 + assert outliers == [75] + + # one more outlier, -30 + c = [6, 7, 15, 36, 39, 40, 41, 42, 43, 47, 49, 75, 77] + (min_s, q0, q1, q2, q3, q4, max_s), outliers = Box._box_points( + c, mode='tukey') + assert min_s == q0 == 6 + assert q1 == 30.75 + assert q2 == 41 + assert q3 == 47.5 + assert max_s == 77 + assert 75 in outliers + assert 77 in outliers + + def test_simple_box(): box = ghostedBox() box.add('test1', [-1, 2, 3, 3.1, 3.2, 4, 5]) From 04bc001793ecb4ba7dae5ebf6aed76c482c51cf6 Mon Sep 17 00:00:00 2001 
From: Jan Richter Date: Sun, 14 Jun 2015 14:54:50 +0200 Subject: [PATCH 3/5] new stdev and pstdev modes of Box plot --- pygal/config.py | 3 ++- pygal/graph/box.py | 46 +++++++++++++++++++++++++++++++++++++++--- pygal/test/test_box.py | 14 ++++++++++++- 3 files changed, 58 insertions(+), 5 deletions(-) diff --git a/pygal/config.py b/pygal/config.py index 733afa6..3e4122a 100644 --- a/pygal/config.py +++ b/pygal/config.py @@ -344,7 +344,8 @@ class Config(CommonConfig): mode = Key( None, str, "Value", "Sets the mode to be used. " "(Currently only supported on box plot)", - "May be %s" % ' or '.join(["1.5IQR", "extremes", "tukey"])) + "May be %s" % ' or '.join(["1.5IQR", "extremes", "tukey", "stdev",\ + "pstdev"])) order_min = Key( None, int, "Value", "Minimum order of scale, defaults to None") diff --git a/pygal/graph/box.py b/pygal/graph/box.py index e662675..1469bee 100644 --- a/pygal/graph/box.py +++ b/pygal/graph/box.py @@ -51,7 +51,7 @@ class Box(Graph): if self.mode == "extremes": return 'Min: %s Q1: %s Q2: %s Q3: %s Max: %s' \ % tuple(map(sup, x[1:6])) - elif self.mode == "tukey": + elif self.mode in ["tukey", "stdev", "pstdev"]: return 'Min: %s Lower Whisker: %s Q1: %s Q2: %s Q3: %s '\ 'Upper Whisker: %s Max: %s' % tuple(map(sup, x)) else: @@ -195,8 +195,15 @@ class Box(Graph): Extremes mode: (mode='extremes') Return a 7-tuple of 2x minimum, Q1, Median, Q3, and 2x maximum for a list of numeric values. - Outliers (Tukey) mode: (mode='tukey') + Tukey mode: (mode='tukey') Return a 7-tuple of min, q[0..4], max and a list of outliers + Outliers are considered values x: x < q1 - 1.5*IQR or x > q3 + 1.5*IQR + SD mode: (mode='stdev') + Return a 7-tuple of min, q[0..4], max and a list of outliers + Outliers are considered values x: x < q2 - SD or x > q2 + SD + SDp mode: (mode='pstdev') + Return a 7-tuple of min, q[0..4], max and a list of outliers + Outliers are considered values x: x < q2 - SDp or x > q2 + SDp The iterator values may include None values.
@@ -212,6 +219,21 @@ class Box(Graph): else: # seq has an odd length return seq[n // 2] + def mean(seq): + return sum(seq) /len(seq) + + def stdev(seq): + m = mean(seq) + l = len(seq) + v = sum((n - m)**2 for n in seq) / (l - 1) # variance + return v**0.5 # sqrt + + def pstdev(seq): + m = mean(seq) + l = len(seq) + v = sum((n - m)**2 for n in seq) / l # variance + return v**0.5 # sqrt + outliers = [] # sort the copy in case the originals must stay in original order s = sorted([x for x in values if x is not None]) @@ -252,7 +274,25 @@ class Box(Graph): q0 = s[b0] q4 = s[b4-1] outliers = s[:b0] + s[b4:] - + elif mode == 'stdev': + # one standard deviation above and below the mean of the data + sd = stdev(s) + print s, sd + b0 = bisect_left(s, q2 - sd) + b4 = bisect_right(s, q2 + sd) + q0 = s[b0] + q4 = s[b4-1] + outliers = s[:b0] + s[b4:] + elif mode == 'pstdev': + # one population standard deviation above and below + # the mean of the data + sdp = pstdev(s) + print s, sd + b0 = bisect_left(s, q2 - sdp) + b4 = bisect_right(s, q2 + sdp) + q0 = s[b0] + q4 = s[b4-1] + outliers = s[:b0] + s[b4:] else: q0 = q1 - 1.5 * iqr q4 = q3 + 1.5 * iqr diff --git a/pygal/test/test_box.py b/pygal/test/test_box.py index 95105d6..a3f98f7 100644 --- a/pygal/test/test_box.py +++ b/pygal/test/test_box.py @@ -113,7 +113,7 @@ def test_quartiles_tukey(): assert max_s == 75 assert outliers == [75] - # one more outlier, -30 + # one more outlier, 77 c = [6, 7, 15, 36, 39, 40, 41, 42, 43, 47, 49, 75, 77] (min_s, q0, q1, q2, q3, q4, max_s), outliers = Box._box_points( c, mode='tukey') @@ -125,6 +125,18 @@ def test_quartiles_tukey(): assert 75 in outliers assert 77 in outliers +def test_quartiles_stdev(): + a = [35, 42, 35, 41, 36, 6, 12, 51, 33, 27, 46, 36, 44, 53, 75, 46, 16,\ + 51, 45, 29, 25, 26, 54, 61, 27, 40, 23, 34, 51, 37] + SD = 14.67 + (min_s, q0, q1, q2, q3, q4, max_s), outliers = Box._box_points( + a, mode='stdev') + assert min_s == min(a) + assert max_s == max(a) + assert q2 == 36.5 
+ assert q4 <= q2 + SD + assert q0 >= q2 - SD + assert all(n in outliers for n in [6, 12, 16, 53, 54, 61, 75]) def test_simple_box(): box = ghostedBox() From 17a13f38ec449ed03dc12aff415ebdf25c37a56f Mon Sep 17 00:00:00 2001 From: Jan Richter Date: Sun, 14 Jun 2015 19:59:18 +0200 Subject: [PATCH 4/5] fixed stdev mode of Box plot for inputs of size 1 --- pygal/graph/box.py | 6 ++++-- pygal/test/test_box.py | 6 ++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pygal/graph/box.py b/pygal/graph/box.py index 1469bee..4d97c43 100644 --- a/pygal/graph/box.py +++ b/pygal/graph/box.py @@ -55,6 +55,7 @@ class Box(Graph): return 'Min: %s Lower Whisker: %s Q1: %s Q2: %s Q3: %s '\ 'Upper Whisker: %s Max: %s' % tuple(map(sup, x)) else: + # 1.5IQR mode return 'Q1: %s Q2: %s Q3: %s' % tuple(map(sup, x[2:5])) else: return sup(x) @@ -240,6 +241,8 @@ class Box(Graph): n = len(s) if not n: return (0, 0, 0, 0, 0, 0, 0), [] + elif n == 1: + return (s[0], s[0], s[0], s[0], s[0], s[0], s[0]), [] else: q2 = median(s) # See 'Method 3' in http://en.wikipedia.org/wiki/Quartile @@ -277,7 +280,6 @@ class Box(Graph): elif mode == 'stdev': # one standard deviation above and below the mean of the data sd = stdev(s) - print s, sd b0 = bisect_left(s, q2 - sd) b4 = bisect_right(s, q2 + sd) q0 = s[b0] @@ -287,13 +289,13 @@ class Box(Graph): # one population standard deviation above and below # the mean of the data sdp = pstdev(s) - print s, sd b0 = bisect_left(s, q2 - sdp) b4 = bisect_right(s, q2 + sdp) q0 = s[b0] q4 = s[b4-1] outliers = s[:b0] + s[b4:] else: + # 1.5IQR mode q0 = q1 - 1.5 * iqr q4 = q3 + 1.5 * iqr return (min_s, q0, q1, q2, q3, q4, max_s), outliers diff --git a/pygal/test/test_box.py b/pygal/test/test_box.py index a3f98f7..2a277a8 100644 --- a/pygal/test/test_box.py +++ b/pygal/test/test_box.py @@ -138,6 +138,12 @@ def test_quartiles_stdev(): assert q0 >= q2 - SD assert all(n in outliers for n in [6, 12, 16, 53, 54, 61, 75]) + b = [5] # test for possible zero
division + (min_s, q0, q1, q2, q3, q4, max_s), outliers = Box._box_points( + b, mode='stdev') + assert min_s == q0 == q1 == q2 == q3 == q4 == max_s == b[0] + assert outliers == [] + def test_simple_box(): box = ghostedBox() box.add('test1', [-1, 2, 3, 3.1, 3.2, 4, 5]) From 7bf1e0f4e55ce0d001aca5802f51666d429d8339 Mon Sep 17 00:00:00 2001 From: Jan Richter Date: Sun, 14 Jun 2015 20:38:49 +0200 Subject: [PATCH 5/5] fixed _len comment --- pygal/graph/box.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pygal/graph/box.py b/pygal/graph/box.py index 4d97c43..29e34b5 100644 --- a/pygal/graph/box.py +++ b/pygal/graph/box.py @@ -99,7 +99,7 @@ class Box(Graph): @property def _len(self): - """Len is always 5 here""" + """Len is always 7 here""" return 7 def _boxf(self, serie):