From 14e6c6303f9419ee69d885812ed43f3f914fdb94 Mon Sep 17 00:00:00 2001 From: Peter Johnson Date: Mon, 29 Oct 2018 17:13:52 +0100 Subject: [PATCH 1/6] refactor middleware dedupe for readability --- helper/diffPlaces.js | 206 +++++++++++++++++-------------------------- middleware/dedupe.js | 135 ++++++++++++++-------------- 2 files changed, 153 insertions(+), 188 deletions(-) diff --git a/helper/diffPlaces.js b/helper/diffPlaces.js index 712c7959..ea15bd79 100644 --- a/helper/diffPlaces.js +++ b/helper/diffPlaces.js @@ -1,176 +1,136 @@ -var _ = require('lodash'); -var placeTypes = require('./placeTypes'); +const _ = require('lodash'); +const placeTypes = require('./placeTypes'); +const field = require('../helper/fieldValue'); /** * Compare the layer properties if they exist. - * Returns false if the objects are the same, and throws - * an exception with the message 'different' if not. - * - * @param {object} item1 - * @param {object} item2 - * @returns {boolean} - * @throws {Error} + * Returns false if the objects are the same, else true. */ -function assertLayerMatch(item1, item2) { - if (item1.layer === item2.layer) { - return false; - } - - throw new Error('different'); +function isLayerDifferent(item1, item2){ + return isPropertyDifferent(item1, item2, 'layer'); } /** - * Compare the parent.*_id properties if they exist. - * Returns false if the objects are the same, and throws - * an exception with the message 'different' if not. - * - * @param {object} item1 - * @param {object} item2 - * @returns {boolean} - * @throws {Error} + * Compare the parent properties if they exist. + * Returns false if the objects are the same, else true. */ -function assertParentHierarchyMatch(item1, item2) { - // if neither object has parent, assume same - if (!item1.hasOwnProperty('parent') && !item2.hasOwnProperty('parent')) { - return false; - } +function isParentHierarchyDifferent(item1, item2){ + let parent1 = _.get(item1, 'parent'); + let parent2 = _.get(item2, 'parent'); - // if both have parent, do the rest of the checking - if (item1.hasOwnProperty('parent') && item2.hasOwnProperty('parent')) { - placeTypes.forEach(function (placeType) { - // don't consider its own id - if (placeType === item1.layer) { - return; - } - propMatch(item1.parent, item2.parent, placeType + '_id'); + // if neither object has parent info, we consider them the same + if( !parent1 && !parent2 ){ return false; } + + // both have parent info + if( _.isPlainObject(parent1) && _.isPlainObject(parent2) ){ + + // iterate over all the placetypes, comparing between items + return placeTypes.some( placeType => { + + // skip the parent field corresponding to the item placetype + if( placeType === item1.layer ){ return false; } + + // ensure the parent ids are the same for all placetypes + return isPropertyDifferent( item1.parent, item2.parent, placeType + '_id' ); }); - return false; } - // if one has parent and the other doesn't consider different - throw new Error('different'); + // if one has parent info and the other doesn't, we consider them different + return true; } /** - * Compare the name.* properties if they exist. - * Returns false if the objects are the same, and throws - * an exception with the message 'different' if not. - * - * @param {object} item1 - * @param {object} item2 - * @returns {boolean} - * @throws {Error} + * Compare the name properties if they exist. + * Returns false if the objects are the same, else true. */ -function assertNameMatch(item1, item2) { - if (item1.hasOwnProperty('name') && item2.hasOwnProperty('name')) { - for (var lang in item1.name) { - if(item2.name.hasOwnProperty(lang) || lang === 'default') { +function isNameDifferent(item1, item2){ + let names1 = _.get(item1, 'name'); + let names2 = _.get(item2, 'name'); + + // if neither object has name info, we consider them the same + if( !names1 && !names2 ){ return false; } + + // if both have name info + if( _.isPlainObject(names1) && _.isPlainObject(names2) ){ + + // iterate over all the languages in item1, comparing between items + return Object.keys(names1).some( lang => { + + // do not consider absence of an additional name as a difference + // but strictly enfore that 'default' must be present and match + if( _.has(names2, lang) || lang === 'default' ){ + // do not consider absence of an additional name as a difference - propMatch(item1.name, item2.name, lang); + return isPropertyDifferent(names1, names2, lang); } - } - } - else { - propMatch(item1, item2, 'name'); + }); } + + // if one has name info and the other doesn't, we consider them different + return true; } /** * Compare the address_parts properties if they exist. - * Returns false if the objects are the same, and throws - * an exception with the message 'different' if not. - * - * @param {object} item1 - * @param {object} item2 - * @returns {boolean} - * @throws {Error} + * Returns false if the objects are the same, else true. */ -function assertAddressMatch(item1, item2) { - // if neither record has address, assume same - if (!item1.hasOwnProperty('address_parts') && !item2.hasOwnProperty('address_parts')) { - return false; - } +function isAddressDifferent(item1, item2){ + let address1 = _.get(item1, 'address_parts'); + let address2 = _.get(item2, 'address_parts'); - // if both have address, check parts - if (item1.hasOwnProperty('address_parts') && item2.hasOwnProperty('address_parts')) { - propMatch(item1.address_parts, item2.address_parts, 'number'); - propMatch(item1.address_parts, item2.address_parts, 'street'); + // if neither object has address info, we consider them the same + if( !address1 && !address2 ){ return false; } + + // if both have address info + if( _.isPlainObject(address1) && _.isPlainObject(address2) ){ + + if( isPropertyDifferent(address1, address2, 'number') ){ return true; } + if( isPropertyDifferent(address1, address2, 'street') ){ return true; } // only compare zip if both records have it, otherwise just ignore and assume it's the same // since by this time we've already compared parent hierarchies - if (item1.address_parts.hasOwnProperty('zip') && item2.address_parts.hasOwnProperty('zip')) { - propMatch(item1.address_parts, item2.address_parts, 'zip'); + if( _.has(address1, 'zip') && _.has(address2, 'zip') ){ + if( isPropertyDifferent(address1, address2, 'zip') ){ return true; } } return false; } // one has address and the other doesn't, different! - throw new Error('different'); + return true; } /** * Compare the two records and return true if they differ and false if same. - * - * @param {object} item1 - * @param {object} item2 - * @returns {boolean} - * @throws {Error} */ -function isDifferent(item1, item2) { - try { - assertLayerMatch(item1, item2); - assertParentHierarchyMatch(item1, item2); - assertNameMatch(item1, item2); - assertAddressMatch(item1, item2); - } - catch (err) { - if (err.message === 'different') { - return true; - } - throw err; - } - +function isDifferent(item1, item2){ + if( isLayerDifferent( item1, item2 ) ){ return true; } + if( isParentHierarchyDifferent( item1, item2 ) ){ return true; } + if( isNameDifferent( item1, item2 ) ){ return true; } + if( isAddressDifferent( item1, item2 ) ){ return true; } return false; } /** - * Throw exception if properties are different - * - * @param {object} item1 - * @param {object} item2 - * @param {string} prop - * @throws {Error} + * return true if properties are different */ -function propMatch(item1, item2, prop) { - var prop1 = item1[prop]; - var prop2 = item2[prop]; +function isPropertyDifferent(item1, item2, prop ){ - // in the case the property is an array (currently only in parent schema) - // simply take the 1st item. this will change in the near future to support multiple hierarchies - if (_.isArray(prop1)) { prop1 = prop1[0]; } - if (_.isArray(prop2)) { prop2 = prop2[0]; } + // if neither item has prop, we consider them the same + if( !_.has(item1, prop) && !_.has(item2, prop) ){ return false; } - if (normalizeString(prop1) !== normalizeString(prop2)) { - throw new Error('different'); - } + // handle arrays and other non-string values + var prop1 = field.getStringValue( _.get( item1, prop ) ); + var prop2 = field.getStringValue( _.get( item2, prop ) ); + + // compare strings + return normalizeString(prop1) !== normalizeString(prop2); } /** - * Remove punctuation and lowercase - * - * @param {string} str - * @returns {string} + * lowercase characters and remove some punctuation */ -function normalizeString(str) { - if (!_.isString(str)) { - return str; - } - - if (_.isEmpty(str)) { - return ''; - } - +function normalizeString(str){ return str.toLowerCase().split(/[ ,-]+/).join(' '); } diff --git a/middleware/dedupe.js b/middleware/dedupe.js index 4b637ed5..ba9e70e7 100644 --- a/middleware/dedupe.js +++ b/middleware/dedupe.js @@ -3,87 +3,92 @@ const _ = require('lodash'); const isDifferent = require('../helper/diffPlaces').isDifferent; const field = require('../helper/fieldValue'); -function setup() { - return dedupeResults; -} - function dedupeResults(req, res, next) { - // do nothing if no result data set - if (_.isUndefined(req.clean) || _.isUndefined(res) || _.isUndefined(res.data)) { - return next(); - } - // loop through data items and only copy unique items to uniqueResults - var uniqueResults = []; + // do nothing if request data is invalid + if( _.isUndefined(res) || !_.isPlainObject(req.clean) ){ return next(); } + + // do nothing if no result data is invalid + if( _.isUndefined(res) || !_.isArray(res.data) || _.isEmpty(res.data) ){ return next(); } + + // loop through data items and only copy unique items to unique + // note: the first reqults must always be unique! + let unique = [ res.data[0] ]; + + // convenience function to search unique array for an existing element which matches a hit + let findMatch = (hit) => unique.findIndex(elem => !isDifferent(elem, hit)); + + // iterate over res.data using an old-school for loop starting at index 1 + // we can call break at any time to end the iterator + for( let i=1; i= req.clean.size ){ break; } + } - res.data = uniqueResults; + // replace the original data with only the unique hits + res.data = unique; next(); } -function isPreferred(existing, candidateReplacement) { - // NOTE: we are assuming here that the layer for both records is the same - const hasZip = _.bind(_.has, null, _.bind.placeholder, 'address_parts.zip'); +// return true if the second argument represents a hit which is preferred +// to the hit in the first argument +function isPreferred(existingHit, candidateHit) { + // prefer a record with a postcode // https://github.com/pelias/api/issues/872 - const candidateHasZip = hasZip(candidateReplacement); - const existingHasZip = hasZip(existing); - if (candidateHasZip !== existingHasZip) { - return candidateHasZip; - } + if( !_.has(existingHit, 'address_parts.zip') && + _.has(candidateHit, 'address_parts.zip') ){ return true; } - //bind the trumps function to the data items to keep the rest of the function clean - var trumpsFunc = trumps.bind(null, existing, candidateReplacement); - - return trumpsFunc('geonames', 'whosonfirst') || // WOF has bbox and is generally preferred - trumpsFunc('openstreetmap', 'openaddresses') || // addresses are better in OA - trumpsFunc('whosonfirst', 'openstreetmap'); // venues are better in OSM, at this time -} - -function trumps(existing, candidateReplacement, loserSource, winnerSource) { - return existing.source === loserSource && candidateReplacement.source === winnerSource; + // prefer certain sources over others + switch( existingHit.source ){ + // sources are the same + case candidateHit.source: return false; + // WOF has bbox and is generally preferred + case 'geonames': return candidateHit.source === 'whosonfirst'; + // addresses are generally better in OA + case 'openstreetmap': return candidateHit.source === 'openaddresses'; + // venues are better in OSM + case 'whosonfirst': return candidateHit.source === 'openstreetmap'; + // no preference, keep existing hit + default: return false; + } } -module.exports = setup; +module.exports = function() { + return dedupeResults; +}; From c0a0663e217714ab2a0a414c9e5a8e2ffb9c50c7 Mon Sep 17 00:00:00 2001 From: Peter Johnson Date: Mon, 29 Oct 2018 19:39:36 +0100 Subject: [PATCH 2/6] feat(dedupe): treat all non-canonical layers and analogous to a venue, prefer non-canonical records --- helper/TypeMapping.js | 19 +++++++++++ helper/diffPlaces.js | 11 +++++- middleware/dedupe.js | 5 +++ ...dupe_elasticsearch_custom_layer_results.js | 32 +++++++++++++++++ test/unit/middleware/dedupe.js | 34 +++++++++++++++---- 5 files changed, 93 insertions(+), 8 deletions(-) create mode 100644 test/unit/fixture/dedupe_elasticsearch_custom_layer_results.js diff --git a/helper/TypeMapping.js b/helper/TypeMapping.js index 97d87a26..7f277c3e 100644 --- a/helper/TypeMapping.js +++ b/helper/TypeMapping.js @@ -1,6 +1,9 @@ const _ = require('lodash'); const elasticsearch = require('elasticsearch'); +// a list of the canonical sources included in the default Pelias configuration +const CANONICAL_SOURCES = ['whosonfirst', 'openstreetmap', 'openaddresses', 'geonames']; + var TypeMapping = function(){ // A list of all sources @@ -75,6 +78,22 @@ TypeMapping.prototype.generateMappings = function(){ this.layer_mapping = TypeMapping.addStandardTargetsToAliases(this.layers, this.layer_aliases); }; +// return a list of all sources which are part of the canonical Pelias configuration +TypeMapping.prototype.getCanonicalSources = function(){ + return CANONICAL_SOURCES; +}; + +// generate a list of all layers which are part of the canonical Pelias configuration +TypeMapping.prototype.getCanonicalLayers = function(){ + var canonicalLayers = []; + for( var source in this.layers_by_source ){ + if( _.includes( CANONICAL_SOURCES, source ) ){ + canonicalLayers = _.uniq( canonicalLayers.concat( this.layers_by_source[source] ) ); + } + } + return canonicalLayers; +}; + // load values from targets block TypeMapping.prototype.loadTargets = function( targetsBlock ){ diff --git a/helper/diffPlaces.js b/helper/diffPlaces.js index ea15bd79..0f327bfb 100644 --- a/helper/diffPlaces.js +++ b/helper/diffPlaces.js @@ -1,5 +1,6 @@ const _ = require('lodash'); const placeTypes = require('./placeTypes'); +const canonicalLayers = require('../helper/type_mapping').getCanonicalLayers(); const field = require('../helper/fieldValue'); /** @@ -7,7 +8,15 @@ const field = require('../helper/fieldValue'); * Returns false if the objects are the same, else true. */ function isLayerDifferent(item1, item2){ - return isPropertyDifferent(item1, item2, 'layer'); + if( isPropertyDifferent(item1, item2, 'layer') ){ + // consider all custom layers to be analogous to a venue + if( ( item1.layer === 'venue' || !_.includes( canonicalLayers, item1.layer ) ) && + ( item2.layer === 'venue' || !_.includes( canonicalLayers, item2.layer ) ) ){ + return false; + } + return true; + } + return false; } /** diff --git a/middleware/dedupe.js b/middleware/dedupe.js index ba9e70e7..175c5887 100644 --- a/middleware/dedupe.js +++ b/middleware/dedupe.js @@ -1,6 +1,7 @@ const logger = require('pelias-logger').get('api'); const _ = require('lodash'); const isDifferent = require('../helper/diffPlaces').isDifferent; +const canonicalSources = require('../helper/type_mapping').getCanonicalSources(); const field = require('../helper/fieldValue'); function dedupeResults(req, res, next) { @@ -74,6 +75,10 @@ function isPreferred(existingHit, candidateHit) { if( !_.has(existingHit, 'address_parts.zip') && _.has(candidateHit, 'address_parts.zip') ){ return true; } + // prefer non-canonical sources over canonical ones + if( !_.includes(canonicalSources, candidateHit.source) && + _.includes(canonicalSources, existingHit.source) ){ return true; } + // prefer certain sources over others switch( existingHit.source ){ // sources are the same diff --git a/test/unit/fixture/dedupe_elasticsearch_custom_layer_results.js b/test/unit/fixture/dedupe_elasticsearch_custom_layer_results.js new file mode 100644 index 00000000..4c1f5f58 --- /dev/null +++ b/test/unit/fixture/dedupe_elasticsearch_custom_layer_results.js @@ -0,0 +1,32 @@ +module.exports = [ + { + '_id': '101914069', + 'layer': 'venue', + 'source': 'openstreetmap', + 'name': { + 'default': 'Nike World Headquarters' + }, + 'parent': { + 'country_a': ['USA'], + 'country': ['United States'], + 'region': ['Oregon'], + 'region_id': ['85688513'] + }, + 'confidence': 0.98 + }, + { + '_id': '2456::trimet::major_employer', + 'layer': 'major_employer', + 'source': 'transit', + 'name': { + 'default': 'Nike World Headquarters' + }, + 'parent': { + 'country_a': ['USA'], + 'country': ['United States'], + 'region': ['Oregon'], + 'region_id': ['85688513'] + }, + 'confidence': 0.50 + } +]; \ No newline at end of file diff --git a/test/unit/middleware/dedupe.js b/test/unit/middleware/dedupe.js index 9b4d04c3..674a2b10 100644 --- a/test/unit/middleware/dedupe.js +++ b/test/unit/middleware/dedupe.js @@ -1,5 +1,6 @@ var data = require('../fixture/dedupe_elasticsearch_results'); var nonAsciiData = require('../fixture/dedupe_elasticsearch_nonascii_results'); +var customLayerData = require('../fixture/dedupe_elasticsearch_custom_layer_results'); var dedupe = require('../../../middleware/dedupe')(); module.exports.tests = {}; @@ -56,10 +57,29 @@ module.exports.tests.dedupe = function(test, common) { t.end(); }); }); + + test('deduplicate between custom layers and venue layers', function(t) { + var req = { + clean: { + size: 20 + } + }; + var res = { + data: customLayerData + }; + var expected = customLayerData[1]; // non-canonical record + + dedupe(req, res, function () { + t.equal(res.data.length, 1, 'only one result displayed'); + t.equal(res.data[0], expected, 'non-canonical data is preferred'); + t.end(); + }); + }); }; -module.exports.tests.trump = function(test, common) { - test('whosonfirst trumps geonames, replace', function (t) { + +module.exports.tests.priority = function(test, common) { + test('whosonfirst takes priority over geonames, replace', function (t) { var req = { clean: { text: 'Lancaster', @@ -91,7 +111,7 @@ module.exports.tests.trump = function(test, common) { }); }); - test('whosonfirst trumps geonames, no replace', function (t) { + test('whosonfirst takes priority over geonames, no replace', function (t) { var req = { clean: { text: 'Lancaster', @@ -123,7 +143,7 @@ module.exports.tests.trump = function(test, common) { }); }); - test('openstreetmap trumps whosonfirst venues', function (t) { + test('openstreetmap takes priority over whosonfirst venues', function (t) { var req = { clean: { text: 'Lancaster Dairy Farm', @@ -155,7 +175,7 @@ module.exports.tests.trump = function(test, common) { }); }); - test('openaddresses trumps openstreetmap', function (t) { + test('openaddresses takes priority over openstreetmap', function (t) { var req = { clean: { text: '100 Main St', @@ -187,7 +207,7 @@ module.exports.tests.trump = function(test, common) { }); }); - test('openaddresses with zip trumps openaddresses without zip', function (t) { + test('openaddresses with zip takes priority over openaddresses without zip', function (t) { var req = { clean: { text: '100 Main St', @@ -223,7 +243,7 @@ module.exports.tests.trump = function(test, common) { }); }); - test('osm with zip trumps openaddresses without zip', function (t) { + test('osm with zip takes priority over openaddresses without zip', function (t) { var req = { clean: { text: '100 Main St', From 077bd3f78ed7174aeaa9ace351d153fb2d42e205 Mon Sep 17 00:00:00 2001 From: missinglink Date: Tue, 30 Oct 2018 15:33:26 +0100 Subject: [PATCH 3/6] feat(dedupe): move canonical_sources config to pelias/config --- helper/TypeMapping.js | 21 ++++++++++++--------- middleware/dedupe.js | 8 ++++---- package.json | 2 +- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/helper/TypeMapping.js b/helper/TypeMapping.js index 7f277c3e..81047fa3 100644 --- a/helper/TypeMapping.js +++ b/helper/TypeMapping.js @@ -1,9 +1,6 @@ const _ = require('lodash'); const elasticsearch = require('elasticsearch'); -// a list of the canonical sources included in the default Pelias configuration -const CANONICAL_SOURCES = ['whosonfirst', 'openstreetmap', 'openaddresses', 'geonames']; - var TypeMapping = function(){ // A list of all sources @@ -29,6 +26,11 @@ var TypeMapping = function(){ */ this.layer_aliases = {}; + /* + * A list of the canonical sources included in the default Pelias configuration + */ + this.canonical_sources = []; + /* * An object that contains all sources or aliases. The key is the source or alias, * the value is either that source, or the canonical name for that alias if it's an alias. @@ -68,6 +70,11 @@ TypeMapping.prototype.setLayerAliases = function( aliases ){ this.layer_aliases = aliases; }; +// canonical sources setter +TypeMapping.prototype.setCanonicalSources = function( sources ){ + this.canonical_sources = sources; +}; + // generate mappings after setters have been run TypeMapping.prototype.generateMappings = function(){ this.sources = Object.keys( this.layers_by_source ); @@ -78,16 +85,11 @@ TypeMapping.prototype.generateMappings = function(){ this.layer_mapping = TypeMapping.addStandardTargetsToAliases(this.layers, this.layer_aliases); }; -// return a list of all sources which are part of the canonical Pelias configuration -TypeMapping.prototype.getCanonicalSources = function(){ - return CANONICAL_SOURCES; -}; - // generate a list of all layers which are part of the canonical Pelias configuration TypeMapping.prototype.getCanonicalLayers = function(){ var canonicalLayers = []; for( var source in this.layers_by_source ){ - if( _.includes( CANONICAL_SOURCES, source ) ){ + if( _.includes( this.canonical_sources, source ) ){ canonicalLayers = _.uniq( canonicalLayers.concat( this.layers_by_source[source] ) ); } } @@ -103,6 +105,7 @@ TypeMapping.prototype.loadTargets = function( targetsBlock ){ this.setSourceAliases( targetsBlock.source_aliases || {} ); this.setLayersBySource( targetsBlock.layers_by_source || {} ); this.setLayerAliases( targetsBlock.layer_aliases || {} ); + this.setCanonicalSources( targetsBlock.canonical_sources || [] ); // generate the mappings this.generateMappings(); diff --git a/middleware/dedupe.js b/middleware/dedupe.js index 175c5887..18054335 100644 --- a/middleware/dedupe.js +++ b/middleware/dedupe.js @@ -1,7 +1,7 @@ const logger = require('pelias-logger').get('api'); const _ = require('lodash'); const isDifferent = require('../helper/diffPlaces').isDifferent; -const canonicalSources = require('../helper/type_mapping').getCanonicalSources(); +const canonical_sources = require('../helper/type_mapping').canonical_sources; const field = require('../helper/fieldValue'); function dedupeResults(req, res, next) { @@ -13,7 +13,7 @@ function dedupeResults(req, res, next) { if( _.isUndefined(res) || !_.isArray(res.data) || _.isEmpty(res.data) ){ return next(); } // loop through data items and only copy unique items to unique - // note: the first reqults must always be unique! + // note: the first results must always be unique! let unique = [ res.data[0] ]; // convenience function to search unique array for an existing element which matches a hit @@ -76,8 +76,8 @@ function isPreferred(existingHit, candidateHit) { _.has(candidateHit, 'address_parts.zip') ){ return true; } // prefer non-canonical sources over canonical ones - if( !_.includes(canonicalSources, candidateHit.source) && - _.includes(canonicalSources, existingHit.source) ){ return true; } + if( !_.includes(canonical_sources, candidateHit.source) && + _.includes(canonical_sources, existingHit.source) ){ return true; } // prefer certain sources over others switch( existingHit.source ){ diff --git a/package.json b/package.json index 80e5f79d..25104b7a 100644 --- a/package.json +++ b/package.json @@ -52,7 +52,7 @@ "markdown": "^0.5.0", "morgan": "^1.8.2", "pelias-categories": "^1.2.0", - "pelias-config": "^3.0.2", + "pelias-config": "^3.7.0", "pelias-labels": "^1.8.0", "pelias-logger": "^1.2.0", "pelias-microservice-wrapper": "^1.7.0", From a31f1a8561f34b29b2a31597c97464bc42737e3a Mon Sep 17 00:00:00 2001 From: Julian Simioni Date: Mon, 29 Oct 2018 21:15:03 -0400 Subject: [PATCH 4/6] Add failing test case for one postcode deduping --- test/unit/middleware/dedupe.js | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/test/unit/middleware/dedupe.js b/test/unit/middleware/dedupe.js index 674a2b10..b1b6e9ef 100644 --- a/test/unit/middleware/dedupe.js +++ b/test/unit/middleware/dedupe.js @@ -1,6 +1,7 @@ var data = require('../fixture/dedupe_elasticsearch_results'); var nonAsciiData = require('../fixture/dedupe_elasticsearch_nonascii_results'); var customLayerData = require('../fixture/dedupe_elasticsearch_custom_layer_results'); +var onlyPostalcodeDiffers = require('../fixture/dedupe_only_postalcode_differs'); var dedupe = require('../../../middleware/dedupe')(); module.exports.tests = {}; @@ -75,6 +76,24 @@ module.exports.tests.dedupe = function(test, common) { t.end(); }); }); + + test('test records with no address except one has postalcode', function(t) { + var req = { + clean: { + size: 20 + } + }; + var res = { + data: onlyPostalcodeDiffers + }; + var expected = onlyPostalcodeDiffers[1]; // non-canonical record + + dedupe(req, res, function () { + t.equal(res.data.length, 1, 'only one result displayed'); + t.equal(res.data[0], expected, 'record with postalcode is preferred'); + t.end(); + }); + }); }; From f153e543c4d5067acdc33ebca5bae92ed5ebde34 Mon Sep 17 00:00:00 2001 From: Julian Simioni Date: Mon, 29 Oct 2018 21:15:03 -0400 Subject: [PATCH 5/6] Add failing test case for one postcode deduping --- .../fixture/dedupe_only_postalcode_differs.js | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 test/unit/fixture/dedupe_only_postalcode_differs.js diff --git a/test/unit/fixture/dedupe_only_postalcode_differs.js b/test/unit/fixture/dedupe_only_postalcode_differs.js new file mode 100644 index 00000000..2cbeb4d6 --- /dev/null +++ b/test/unit/fixture/dedupe_only_postalcode_differs.js @@ -0,0 +1,27 @@ +module.exports = [ + { + '_id': '101914069', + 'layer': 'venue', + 'source': 'openstreetmap', + 'name': { + 'default': 'A place' + }, + 'parent': { + 'country_a': ['USA'] + } + }, + { + '_id': '323', + 'layer': 'venue', + 'source': 'openstreetmap', + 'name': { + 'default': 'A place' + }, + 'address_parts': { + 'zip': '97005' + }, + 'parent': { + 'country_a': ['USA'] + } + } +]; From b2069606f20ea03940d6ebe08a8f81f4a571ddaa Mon Sep 17 00:00:00 2001 From: missinglink Date: Tue, 30 Oct 2018 15:59:38 +0100 Subject: [PATCH 6/6] feat(dedupe): improved handling of cases where "name", "parent" or "address_parts" propertiees are not set --- helper/diffPlaces.js | 93 ++++++++++++++++++---------------- test/unit/middleware/dedupe.js | 6 +-- 2 files changed, 52 insertions(+), 47 deletions(-) diff --git a/helper/diffPlaces.js b/helper/diffPlaces.js index 0f327bfb..92b9ea51 100644 --- a/helper/diffPlaces.js +++ b/helper/diffPlaces.js @@ -27,25 +27,27 @@ function isParentHierarchyDifferent(item1, item2){ let parent1 = _.get(item1, 'parent'); let parent2 = _.get(item2, 'parent'); - // if neither object has parent info, we consider them the same - if( !parent1 && !parent2 ){ return false; } + // check if these are plain 'ol javascript objects + let isPojo1 = _.isPlainObject(parent1); + let isPojo2 = _.isPlainObject(parent2); - // both have parent info - if( _.isPlainObject(parent1) && _.isPlainObject(parent2) ){ + // if neither object has parent info, we consider them the same + if( !isPojo1 && !isPojo2 ){ return false; } - // iterate over all the placetypes, comparing between items - return placeTypes.some( placeType => { + // if only one has parent info, we consider them the same + // note: this really shouldn't happen as at least on parent should exist + if( !isPojo1 || !isPojo2 ){ return false; } - // skip the parent field corresponding to the item placetype - if( placeType === item1.layer ){ return false; } + // else both have parent info + // iterate over all the placetypes, comparing between items + return placeTypes.some( placeType => { - // ensure the parent ids are the same for all placetypes - return isPropertyDifferent( item1.parent, item2.parent, placeType + '_id' ); - }); - } + // skip the parent field corresponding to the item placetype + if( placeType === item1.layer ){ return false; } - // if one has parent info and the other doesn't, we consider them different - return true; + // ensure the parent ids are the same for all placetypes + return isPropertyDifferent( item1.parent, item2.parent, placeType + '_id' ); + }); } /** @@ -56,27 +58,29 @@ function isNameDifferent(item1, item2){ let names1 = _.get(item1, 'name'); let names2 = _.get(item2, 'name'); - // if neither object has name info, we consider them the same - if( !names1 && !names2 ){ return false; } + // check if these are plain 'ol javascript objects + let isPojo1 = _.isPlainObject(names1); + let isPojo2 = _.isPlainObject(names2); - // if both have name info - if( _.isPlainObject(names1) && _.isPlainObject(names2) ){ + // if neither object has name info, we consider them the same + if( !isPojo1 && !isPojo2 ){ return false; } - // iterate over all the languages in item1, comparing between items - return Object.keys(names1).some( lang => { + // if only one has name info, we consider them the same + // note: this really shouldn't happen as name is a mandatory field + if( !isPojo1 || !isPojo2 ){ return false; } - // do not consider absence of an additional name as a difference - // but strictly enfore that 'default' must be present and match - if( _.has(names2, lang) || lang === 'default' ){ + // else both have name info + // iterate over all the languages in item1, comparing between items + return Object.keys(names1).some( lang => { - // do not consider absence of an additional name as a difference - return isPropertyDifferent(names1, names2, lang); - } - }); - } + // do not consider absence of an additional name as a difference + // but strictly enfore that 'default' must be present and match + if( _.has(names2, lang) || lang === 'default' ){ - // if one has name info and the other doesn't, we consider them different - return true; + // do not consider absence of an additional name as a difference + return isPropertyDifferent(names1, names2, lang); + } + }); } /** @@ -87,26 +91,27 @@ function isAddressDifferent(item1, item2){ let address1 = _.get(item1, 'address_parts'); let address2 = _.get(item2, 'address_parts'); - // if neither object has address info, we consider them the same - if( !address1 && !address2 ){ return false; } + // check if these are plain 'ol javascript objects + let isPojo1 = _.isPlainObject(address1); + let isPojo2 = _.isPlainObject(address2); - // if both have address info - if( _.isPlainObject(address1) && _.isPlainObject(address2) ){ + // if neither object has address info, we consider them the same + if( !isPojo1 && !isPojo2 ){ return false; } - if( isPropertyDifferent(address1, address2, 'number') ){ return true; } - if( isPropertyDifferent(address1, address2, 'street') ){ return true; } + // if only one has address info, we consider them the same + if( !isPojo1 || !isPojo2 ){ return false; } - // only compare zip if both records have it, otherwise just ignore and assume it's the same - // since by this time we've already compared parent hierarchies - if( _.has(address1, 'zip') && _.has(address2, 'zip') ){ - if( isPropertyDifferent(address1, address2, 'zip') ){ return true; } - } + // else both have address info + if( isPropertyDifferent(address1, address2, 'number') ){ return true; } + if( isPropertyDifferent(address1, address2, 'street') ){ return true; } - return false; + // only compare zip if both records have it, otherwise just ignore and assume it's the same + // since by this time we've already compared parent hierarchies + if( _.has(address1, 'zip') && _.has(address2, 'zip') ){ + if( isPropertyDifferent(address1, address2, 'zip') ){ return true; } } - // one has address and the other doesn't, different! - return true; + return false; } /** diff --git a/test/unit/middleware/dedupe.js b/test/unit/middleware/dedupe.js index b1b6e9ef..63cb3373 100644 --- a/test/unit/middleware/dedupe.js +++ b/test/unit/middleware/dedupe.js @@ -1,7 +1,7 @@ var data = require('../fixture/dedupe_elasticsearch_results'); var nonAsciiData = require('../fixture/dedupe_elasticsearch_nonascii_results'); var customLayerData = require('../fixture/dedupe_elasticsearch_custom_layer_results'); -var onlyPostalcodeDiffers = require('../fixture/dedupe_only_postalcode_differs'); +var onlyPostalcodeDiffersData = require('../fixture/dedupe_only_postalcode_differs'); var dedupe = require('../../../middleware/dedupe')(); module.exports.tests = {}; @@ -84,9 +84,9 @@ module.exports.tests.dedupe = function(test, common) { } }; var res = { - data: onlyPostalcodeDiffers + data: onlyPostalcodeDiffersData }; - var expected = onlyPostalcodeDiffers[1]; // non-canonical record + var expected = onlyPostalcodeDiffersData[1]; // record with postcode dedupe(req, res, function () { t.equal(res.data.length, 1, 'only one result displayed');