From 14e6c6303f9419ee69d885812ed43f3f914fdb94 Mon Sep 17 00:00:00 2001 From: Peter Johnson Date: Mon, 29 Oct 2018 17:13:52 +0100 Subject: [PATCH] refactor middleware dedupe for readability --- helper/diffPlaces.js | 206 +++++++++++++++++-------------------------- middleware/dedupe.js | 135 ++++++++++++++-------------- 2 files changed, 153 insertions(+), 188 deletions(-) diff --git a/helper/diffPlaces.js b/helper/diffPlaces.js index 712c7959..ea15bd79 100644 --- a/helper/diffPlaces.js +++ b/helper/diffPlaces.js @@ -1,176 +1,136 @@ -var _ = require('lodash'); -var placeTypes = require('./placeTypes'); +const _ = require('lodash'); +const placeTypes = require('./placeTypes'); +const field = require('../helper/fieldValue'); /** * Compare the layer properties if they exist. - * Returns false if the objects are the same, and throws - * an exception with the message 'different' if not. - * - * @param {object} item1 - * @param {object} item2 - * @returns {boolean} - * @throws {Error} + * Returns false if the objects are the same, else true. */ -function assertLayerMatch(item1, item2) { - if (item1.layer === item2.layer) { - return false; - } - - throw new Error('different'); +function isLayerDifferent(item1, item2){ + return isPropertyDifferent(item1, item2, 'layer'); } /** - * Compare the parent.*_id properties if they exist. - * Returns false if the objects are the same, and throws - * an exception with the message 'different' if not. - * - * @param {object} item1 - * @param {object} item2 - * @returns {boolean} - * @throws {Error} + * Compare the parent properties if they exist. + * Returns false if the objects are the same, else true. */ -function assertParentHierarchyMatch(item1, item2) { - // if neither object has parent, assume same - if (!item1.hasOwnProperty('parent') && !item2.hasOwnProperty('parent')) { - return false; - } +function isParentHierarchyDifferent(item1, item2){ + let parent1 = _.get(item1, 'parent'); + let parent2 = _.get(item2, 'parent'); - // if both have parent, do the rest of the checking - if (item1.hasOwnProperty('parent') && item2.hasOwnProperty('parent')) { - placeTypes.forEach(function (placeType) { - // don't consider its own id - if (placeType === item1.layer) { - return; - } - propMatch(item1.parent, item2.parent, placeType + '_id'); + // if neither object has parent info, we consider them the same + if( !parent1 && !parent2 ){ return false; } + + // both have parent info + if( _.isPlainObject(parent1) && _.isPlainObject(parent2) ){ + + // iterate over all the placetypes, comparing between items + return placeTypes.some( placeType => { + + // skip the parent field corresponding to the item placetype + if( placeType === item1.layer ){ return false; } + + // ensure the parent ids are the same for all placetypes + return isPropertyDifferent( item1.parent, item2.parent, placeType + '_id' ); }); - return false; } - // if one has parent and the other doesn't consider different - throw new Error('different'); + // if one has parent info and the other doesn't, we consider them different + return true; } /** - * Compare the name.* properties if they exist. - * Returns false if the objects are the same, and throws - * an exception with the message 'different' if not. - * - * @param {object} item1 - * @param {object} item2 - * @returns {boolean} - * @throws {Error} + * Compare the name properties if they exist. + * Returns false if the objects are the same, else true. */ -function assertNameMatch(item1, item2) { - if (item1.hasOwnProperty('name') && item2.hasOwnProperty('name')) { - for (var lang in item1.name) { - if(item2.name.hasOwnProperty(lang) || lang === 'default') { +function isNameDifferent(item1, item2){ + let names1 = _.get(item1, 'name'); + let names2 = _.get(item2, 'name'); + + // if neither object has name info, we consider them the same + if( !names1 && !names2 ){ return false; } + + // if both have name info + if( _.isPlainObject(names1) && _.isPlainObject(names2) ){ + + // iterate over all the languages in item1, comparing between items + return Object.keys(names1).some( lang => { + + // do not consider absence of an additional name as a difference + // but strictly enfore that 'default' must be present and match + if( _.has(names2, lang) || lang === 'default' ){ + // do not consider absence of an additional name as a difference - propMatch(item1.name, item2.name, lang); + return isPropertyDifferent(names1, names2, lang); } - } - } - else { - propMatch(item1, item2, 'name'); + }); } + + // if one has name info and the other doesn't, we consider them different + return true; } /** * Compare the address_parts properties if they exist. - * Returns false if the objects are the same, and throws - * an exception with the message 'different' if not. - * - * @param {object} item1 - * @param {object} item2 - * @returns {boolean} - * @throws {Error} + * Returns false if the objects are the same, else true. */ -function assertAddressMatch(item1, item2) { - // if neither record has address, assume same - if (!item1.hasOwnProperty('address_parts') && !item2.hasOwnProperty('address_parts')) { - return false; - } +function isAddressDifferent(item1, item2){ + let address1 = _.get(item1, 'address_parts'); + let address2 = _.get(item2, 'address_parts'); - // if both have address, check parts - if (item1.hasOwnProperty('address_parts') && item2.hasOwnProperty('address_parts')) { - propMatch(item1.address_parts, item2.address_parts, 'number'); - propMatch(item1.address_parts, item2.address_parts, 'street'); + // if neither object has address info, we consider them the same + if( !address1 && !address2 ){ return false; } + + // if both have address info + if( _.isPlainObject(address1) && _.isPlainObject(address2) ){ + + if( isPropertyDifferent(address1, address2, 'number') ){ return true; } + if( isPropertyDifferent(address1, address2, 'street') ){ return true; } // only compare zip if both records have it, otherwise just ignore and assume it's the same // since by this time we've already compared parent hierarchies - if (item1.address_parts.hasOwnProperty('zip') && item2.address_parts.hasOwnProperty('zip')) { - propMatch(item1.address_parts, item2.address_parts, 'zip'); + if( _.has(address1, 'zip') && _.has(address2, 'zip') ){ + if( isPropertyDifferent(address1, address2, 'zip') ){ return true; } } return false; } // one has address and the other doesn't, different! - throw new Error('different'); + return true; } /** * Compare the two records and return true if they differ and false if same. - * - * @param {object} item1 - * @param {object} item2 - * @returns {boolean} - * @throws {Error} */ -function isDifferent(item1, item2) { - try { - assertLayerMatch(item1, item2); - assertParentHierarchyMatch(item1, item2); - assertNameMatch(item1, item2); - assertAddressMatch(item1, item2); - } - catch (err) { - if (err.message === 'different') { - return true; - } - throw err; - } - +function isDifferent(item1, item2){ + if( isLayerDifferent( item1, item2 ) ){ return true; } + if( isParentHierarchyDifferent( item1, item2 ) ){ return true; } + if( isNameDifferent( item1, item2 ) ){ return true; } + if( isAddressDifferent( item1, item2 ) ){ return true; } return false; } /** - * Throw exception if properties are different - * - * @param {object} item1 - * @param {object} item2 - * @param {string} prop - * @throws {Error} + * return true if properties are different */ -function propMatch(item1, item2, prop) { - var prop1 = item1[prop]; - var prop2 = item2[prop]; +function isPropertyDifferent(item1, item2, prop ){ - // in the case the property is an array (currently only in parent schema) - // simply take the 1st item. this will change in the near future to support multiple hierarchies - if (_.isArray(prop1)) { prop1 = prop1[0]; } - if (_.isArray(prop2)) { prop2 = prop2[0]; } + // if neither item has prop, we consider them the same + if( !_.has(item1, prop) && !_.has(item2, prop) ){ return false; } - if (normalizeString(prop1) !== normalizeString(prop2)) { - throw new Error('different'); - } + // handle arrays and other non-string values + var prop1 = field.getStringValue( _.get( item1, prop ) ); + var prop2 = field.getStringValue( _.get( item2, prop ) ); + + // compare strings + return normalizeString(prop1) !== normalizeString(prop2); } /** - * Remove punctuation and lowercase - * - * @param {string} str - * @returns {string} + * lowercase characters and remove some punctuation */ -function normalizeString(str) { - if (!_.isString(str)) { - return str; - } - - if (_.isEmpty(str)) { - return ''; - } - +function normalizeString(str){ return str.toLowerCase().split(/[ ,-]+/).join(' '); } diff --git a/middleware/dedupe.js b/middleware/dedupe.js index 4b637ed5..ba9e70e7 100644 --- a/middleware/dedupe.js +++ b/middleware/dedupe.js @@ -3,87 +3,92 @@ const _ = require('lodash'); const isDifferent = require('../helper/diffPlaces').isDifferent; const field = require('../helper/fieldValue'); -function setup() { - return dedupeResults; -} - function dedupeResults(req, res, next) { - // do nothing if no result data set - if (_.isUndefined(req.clean) || _.isUndefined(res) || _.isUndefined(res.data)) { - return next(); - } - // loop through data items and only copy unique items to uniqueResults - var uniqueResults = []; + // do nothing if request data is invalid + if( _.isUndefined(res) || !_.isPlainObject(req.clean) ){ return next(); } + + // do nothing if no result data is invalid + if( _.isUndefined(res) || !_.isArray(res.data) || _.isEmpty(res.data) ){ return next(); } + + // loop through data items and only copy unique items to unique + // note: the first reqults must always be unique! + let unique = [ res.data[0] ]; + + // convenience function to search unique array for an existing element which matches a hit + let findMatch = (hit) => unique.findIndex(elem => !isDifferent(elem, hit)); + + // iterate over res.data using an old-school for loop starting at index 1 + // we can call break at any time to end the iterator + for( let i=1; i= req.clean.size ){ break; } + } - res.data = uniqueResults; + // replace the original data with only the unique hits + res.data = unique; next(); } -function isPreferred(existing, candidateReplacement) { - // NOTE: we are assuming here that the layer for both records is the same - const hasZip = _.bind(_.has, null, _.bind.placeholder, 'address_parts.zip'); +// return true if the second argument represents a hit which is preferred +// to the hit in the first argument +function isPreferred(existingHit, candidateHit) { + // prefer a record with a postcode // https://github.com/pelias/api/issues/872 - const candidateHasZip = hasZip(candidateReplacement); - const existingHasZip = hasZip(existing); - if (candidateHasZip !== existingHasZip) { - return candidateHasZip; - } + if( !_.has(existingHit, 'address_parts.zip') && + _.has(candidateHit, 'address_parts.zip') ){ return true; } - //bind the trumps function to the data items to keep the rest of the function clean - var trumpsFunc = trumps.bind(null, existing, candidateReplacement); - - return trumpsFunc('geonames', 'whosonfirst') || // WOF has bbox and is generally preferred - trumpsFunc('openstreetmap', 'openaddresses') || // addresses are better in OA - trumpsFunc('whosonfirst', 'openstreetmap'); // venues are better in OSM, at this time -} - -function trumps(existing, candidateReplacement, loserSource, winnerSource) { - return existing.source === loserSource && candidateReplacement.source === winnerSource; + // prefer certain sources over others + switch( existingHit.source ){ + // sources are the same + case candidateHit.source: return false; + // WOF has bbox and is generally preferred + case 'geonames': return candidateHit.source === 'whosonfirst'; + // addresses are generally better in OA + case 'openstreetmap': return candidateHit.source === 'openaddresses'; + // venues are better in OSM + case 'whosonfirst': return candidateHit.source === 'openstreetmap'; + // no preference, keep existing hit + default: return false; + } } -module.exports = setup; +module.exports = function() { + return dedupeResults; +};