From 42d940f8c85d491d48d7aa393c2c60f0d322786d Mon Sep 17 00:00:00 2001 From: Diana Shkolnikov Date: Wed, 2 Dec 2015 16:47:49 -0500 Subject: [PATCH] Add simple normalizer (lowercase + remove punctuation) --- controller/search.js | 2 +- middleware/dedupe.js | 29 ++++++++++++++----- .../fixture/dedupe_elasticsearch_results.js | 25 ++++++++++++++++ 3 files changed, 47 insertions(+), 9 deletions(-) diff --git a/controller/search.js b/controller/search.js index 38bfb481..309e7647 100644 --- a/controller/search.js +++ b/controller/search.js @@ -16,7 +16,7 @@ function setup( backend, query ){ } // log clean parameters for stats - logger.info(req.clean); + logger.info('[req]', 'endpoint=' + req.path, req.clean); // backend command var cmd = { diff --git a/middleware/dedupe.js b/middleware/dedupe.js index c6e95c1c..445190c2 100644 --- a/middleware/dedupe.js +++ b/middleware/dedupe.js @@ -1,5 +1,4 @@ -var util = require('util'); -var logger = require('pelias-logger').get('api:middle:dedupe'); +var logger = require('pelias-logger').get('api'); var _ = require('lodash'); function setup() { @@ -45,8 +44,8 @@ function isDifferent(item1, item2) { if (item1.hasOwnProperty('name') && item2.hasOwnProperty('name')) { propMatch(item1.name, item2.name, 'default'); } - else if (item1.name !== item2.name) { - throw 'different'; + else { + propMatch(item1, item2, 'name'); } if (item1.hasOwnProperty('address') && item2.hasOwnProperty('address')) { @@ -68,15 +67,29 @@ function isDifferent(item1, item2) { /** * Throw exception if properties are different * - * @param item1 - * @param item2 - * @param prop + * @param {object} item1 + * @param {object} item2 + * @param {string} prop + * @throws {string} */ function propMatch(item1, item2, prop) { - if (item1[prop] !== item2[prop]) { + if (normalizeString(item1[prop]) !== normalizeString(item2[prop])) { throw 'different'; } } +/** + * Remove punctuation and lowercase + * + * @param {string} str + * @returns {string} + */ +function normalizeString(str) { + if (!str) { + return ''; + } + return _.words(str.toLowerCase()).join(' '); +} + module.exports = setup; \ No newline at end of file diff --git a/test/unit/fixture/dedupe_elasticsearch_results.js b/test/unit/fixture/dedupe_elasticsearch_results.js index ae048564..6ed761c4 100644 --- a/test/unit/fixture/dedupe_elasticsearch_results.js +++ b/test/unit/fixture/dedupe_elasticsearch_results.js @@ -24,6 +24,31 @@ module.exports = [ '_score': 1.2367082, 'confidence': 0.879 }, + { + 'center_point': { + 'lon': -76.207456, + 'lat': 40.039265 + }, + 'address': {}, + 'local_admin': 'East Lampeter', + 'admin1_abbr': 'PA', + 'name': { + 'default': 'East Lampeter, High-School' + }, + 'admin1': 'Pennsylvania', + 'locality': 'Smoketown', + 'alpha3': 'USA', + 'admin2': 'Lancaster County', + 'admin0': 'United States', + 'neighborhood': 'Greenland', + 'category': [ + 'education' + ], + '_id': '357321757', + '_type': 'osmnode', + '_score': 1.2367082, + 'confidence': 0.879 + }, { 'center_point': { 'lon': -76.23246,