From df93be7543fc37d303d12fc93c3c988191744161 Mon Sep 17 00:00:00 2001
From: Diana Shkolnikov
Date: Thu, 17 Dec 2015 13:51:06 -0500
Subject: [PATCH] Fix deduping for non-ascii strings

---
 middleware/dedupe.js                          |  2 +-
 .../dedupe_elasticsearch_nonascii_results.js  | 61 +++++++++++++++++++
 test/unit/middleware/dedupe.js                | 18 ++++++
 3 files changed, 80 insertions(+), 1 deletion(-)
 create mode 100644 test/unit/fixture/dedupe_elasticsearch_nonascii_results.js

diff --git a/middleware/dedupe.js b/middleware/dedupe.js
index b02ca2a3..87bf20db 100644
--- a/middleware/dedupe.js
+++ b/middleware/dedupe.js
@@ -92,7 +92,7 @@ function normalizeString(str) {
   if (!str) {
     return '';
   }
-  return _.words(str.toLowerCase()).join(' ');
+  return str.toLowerCase().split(/[ ,-]+/).join(' ');
 }
 
 
diff --git a/test/unit/fixture/dedupe_elasticsearch_nonascii_results.js b/test/unit/fixture/dedupe_elasticsearch_nonascii_results.js
new file mode 100644
index 00000000..3341141a
--- /dev/null
+++ b/test/unit/fixture/dedupe_elasticsearch_nonascii_results.js
@@ -0,0 +1,61 @@
+module.exports = [
+  {
+    'id': 'foobar',
+    'gid': 'osm:venue:foobar',
+    'layer': 'venue',
+    'source': 'osm',
+    'name': {
+      'default': '万里长城万里长城'
+    },
+    'country_a': 'CHN',
+    'country': 'China',
+    'region': 'Beijing',
+    'confidence': 0.733
+  },
+  {
+    'id': '185883777',
+    'gid': 'osm:venue:185883777',
+    'layer': 'venue',
+    'source': 'osm',
+    'name': {
+      'default': '万里长城'
+    },
+    'country_a': 'CHN',
+    'country': 'China',
+    'region': 'Beijing',
+    'confidence': 0.733
+  },
+  {
+    'id': '1877602615',
+    'gid': 'osm:venue:1877602615',
+    'layer': 'venue',
+    'source': 'osm',
+    'name': {
+      'default': '万里花'
+    },
+    'country_a': 'JPN',
+    'country': 'Japan',
+    'region': 'Tokyo',
+    'county': '豊島区',
+    'locality': 'Tokyo',
+    'neighbourhood': '2丁目',
+    'confidence': 0.646
+  },
+  {
+    'id': '231404818',
+    'gid': 'osm:venue:231404818',
+    'layer': 'venue',
+    'source': 'osm',
+    'name': {
+      'default': '万里加油站'
+    },
+    'address': {
+      'street': 'S308',
+      'postalcode': '312044'
+    },
+    'country_a': 'CHN',
+    'country': 'China',
+    'region': 'Zhejiang',
+    'confidence': 0.646
+  }
+];
\ No newline at end of file
diff --git a/test/unit/middleware/dedupe.js b/test/unit/middleware/dedupe.js
index 31d4bf6f..6707d1f9 100644
--- a/test/unit/middleware/dedupe.js
+++ b/test/unit/middleware/dedupe.js
@@ -1,4 +1,5 @@
 var data = require('../fixture/dedupe_elasticsearch_results');
+var nonAsciiData = require('../fixture/dedupe_elasticsearch_nonascii_results');
 var dedupe = require('../../../middleware/dedupe')();
 
 module.exports.tests = {};
@@ -22,6 +23,23 @@ module.exports.tests.dedupe = function(test, common) {
     });
   });
 
+  test('handle non-ascii gracefully', function(t) {
+    var req = {
+      clean: {
+        size: 100
+      }
+    };
+    var res = {
+      data: nonAsciiData
+    };
+
+    var expectedCount = 4;
+    dedupe(req, res, function () {
+      t.equal(res.data.length, expectedCount, 'none were removed');
+      t.end();
+    });
+  });
+
   test('truncate results based on specified size', function(t) {
     var req = {
       clean: {