Browse Source

Merge pull request #390 from pelias/dedupe-non-ascii

Fix deduping for non-ascii strings
pull/389/head
Julian Simioni 9 years ago
parent
commit
838e66cb1d
  1. 2
      middleware/dedupe.js
  2. 61
      test/unit/fixture/dedupe_elasticsearch_nonascii_results.js
  3. 18
      test/unit/middleware/dedupe.js

2
middleware/dedupe.js

@@ -92,7 +92,7 @@ function normalizeString(str) {
if (!str) { if (!str) {
return ''; return '';
} }
return _.words(str.toLowerCase()).join(' '); return str.toLowerCase().split(/[ ,-]+/).join(' ');
} }

61
test/unit/fixture/dedupe_elasticsearch_nonascii_results.js

@@ -0,0 +1,61 @@
// Fixture: four venue records (source 'osm') whose names consist entirely of
// non-ASCII (CJK) characters. All four names begin with the same two characters
// but are otherwise distinct. The 'handle non-ascii gracefully' test in
// test/unit/middleware/dedupe.js passes this array to the dedupe middleware
// and expects every record to be kept.
module.exports = [
{
'id': 'foobar',
'gid': 'osm:venue:foobar',
'layer': 'venue',
'source': 'osm',
'name': {
// This name is the next record's name written twice in a row.
'default': '万里长城万里长城'
},
'country_a': 'CHN',
'country': 'China',
'region': 'Beijing',
'confidence': 0.733
},
{
// Same country, region and confidence as the record above; only id, gid and name differ.
'id': '185883777',
'gid': 'osm:venue:185883777',
'layer': 'venue',
'source': 'osm',
'name': {
'default': '万里长城'
},
'country_a': 'CHN',
'country': 'China',
'region': 'Beijing',
'confidence': 0.733
},
{
// Record from a different country, with extra admin fields (county, locality, neighbourhood).
'id': '1877602615',
'gid': 'osm:venue:1877602615',
'layer': 'venue',
'source': 'osm',
'name': {
'default': '万里花'
},
'country_a': 'JPN',
'country': 'Japan',
'region': 'Tokyo',
'county': '豊島区',
'locality': 'Tokyo',
'neighbourhood': '2丁目',
'confidence': 0.646
},
{
// The only record in this fixture that carries an 'address' object.
'id': '231404818',
'gid': 'osm:venue:231404818',
'layer': 'venue',
'source': 'osm',
'name': {
'default': '万里加油站'
},
'address': {
'street': 'S308',
'postalcode': '312044'
},
'country_a': 'CHN',
'country': 'China',
'region': 'Zhejiang',
'confidence': 0.646
}
];

18
test/unit/middleware/dedupe.js

@@ -1,4 +1,5 @@
var data = require('../fixture/dedupe_elasticsearch_results'); var data = require('../fixture/dedupe_elasticsearch_results');
var nonAsciiData = require('../fixture/dedupe_elasticsearch_nonascii_results');
var dedupe = require('../../../middleware/dedupe')(); var dedupe = require('../../../middleware/dedupe')();
module.exports.tests = {}; module.exports.tests = {};
@@ -22,6 +23,23 @@ module.exports.tests.dedupe = function(test, common) {
}); });
}); });
// Regression test: records whose names are made of non-ASCII characters must
// not be collapsed into one another by the dedupe middleware.
test('handle non-ascii gracefully', function(t) {
var req = {
clean: {
// Well above the four records in the fixture.
size: 100
}
};
var res = {
data: nonAsciiData
};
// The fixture holds four records with distinct names, so all four should remain.
var expectedCount = 4;
dedupe(req, res, function () {
// The middleware's output is read back from res.data once the callback fires.
t.equal(res.data.length, expectedCount, 'none were removed');
t.end();
});
});
test('truncate results based on specified size', function(t) { test('truncate results based on specified size', function(t) {
var req = { var req = {
clean: { clean: {

Loading…
Cancel
Save