Browse Source

refactor middleware dedupe for readability

dedupe-failing-test-case
Peter Johnson 6 years ago committed by Julian Simioni
parent
commit
c458ea74d8
No known key found for this signature in database
GPG Key ID: B9EEB0C6EE0910A1
  1. 202
      helper/diffPlaces.js
  2. 107
      middleware/dedupe.js

202
helper/diffPlaces.js

@ -1,176 +1,136 @@
var _ = require('lodash'); const _ = require('lodash');
var placeTypes = require('./placeTypes'); const placeTypes = require('./placeTypes');
const field = require('../helper/fieldValue');
/** /**
* Compare the layer properties if they exist. * Compare the layer properties if they exist.
* Returns false if the objects are the same, and throws * Returns false if the objects are the same, else true.
* an exception with the message 'different' if not.
*
* @param {object} item1
* @param {object} item2
* @returns {boolean}
* @throws {Error}
*/ */
function assertLayerMatch(item1, item2) { function isLayerDifferent(item1, item2){
if (item1.layer === item2.layer) { return isPropertyDifferent(item1, item2, 'layer');
return false;
}
throw new Error('different');
} }
/** /**
* Compare the parent.*_id properties if they exist. * Compare the parent properties if they exist.
* Returns false if the objects are the same, and throws * Returns false if the objects are the same, else true.
* an exception with the message 'different' if not.
*
* @param {object} item1
* @param {object} item2
* @returns {boolean}
* @throws {Error}
*/ */
function assertParentHierarchyMatch(item1, item2) { function isParentHierarchyDifferent(item1, item2){
// if neither object has parent, assume same let parent1 = _.get(item1, 'parent');
if (!item1.hasOwnProperty('parent') && !item2.hasOwnProperty('parent')) { let parent2 = _.get(item2, 'parent');
return false;
}
// if both have parent, do the rest of the checking // if neither object has parent info, we consider them the same
if (item1.hasOwnProperty('parent') && item2.hasOwnProperty('parent')) { if( !parent1 && !parent2 ){ return false; }
placeTypes.forEach(function (placeType) {
// don't consider its own id // both have parent info
if (placeType === item1.layer) { if( _.isPlainObject(parent1) && _.isPlainObject(parent2) ){
return;
} // iterate over all the placetypes, comparing between items
propMatch(item1.parent, item2.parent, placeType + '_id'); return placeTypes.some( placeType => {
// skip the parent field corresponding to the item placetype
if( placeType === item1.layer ){ return false; }
// ensure the parent ids are the same for all placetypes
return isPropertyDifferent( item1.parent, item2.parent, placeType + '_id' );
}); });
return false;
} }
// if one has parent and the other doesn't consider different // if one has parent info and the other doesn't, we consider them different
throw new Error('different'); return true;
} }
/** /**
* Compare the name.* properties if they exist. * Compare the name properties if they exist.
* Returns false if the objects are the same, and throws * Returns false if the objects are the same, else true.
* an exception with the message 'different' if not.
*
* @param {object} item1
* @param {object} item2
* @returns {boolean}
* @throws {Error}
*/ */
function assertNameMatch(item1, item2) { function isNameDifferent(item1, item2){
if (item1.hasOwnProperty('name') && item2.hasOwnProperty('name')) { let names1 = _.get(item1, 'name');
for (var lang in item1.name) { let names2 = _.get(item2, 'name');
if(item2.name.hasOwnProperty(lang) || lang === 'default') {
// if neither object has name info, we consider them the same
if( !names1 && !names2 ){ return false; }
// if both have name info
if( _.isPlainObject(names1) && _.isPlainObject(names2) ){
// iterate over all the languages in item1, comparing between items
return Object.keys(names1).some( lang => {
// do not consider absence of an additional name as a difference // do not consider absence of an additional name as a difference
propMatch(item1.name, item2.name, lang); // but strictly enfore that 'default' must be present and match
} if( _.has(names2, lang) || lang === 'default' ){
}
// do not consider absence of an additional name as a difference
return isPropertyDifferent(names1, names2, lang);
} }
else { });
propMatch(item1, item2, 'name');
} }
// if one has name info and the other doesn't, we consider them different
return true;
} }
/** /**
* Compare the address_parts properties if they exist. * Compare the address_parts properties if they exist.
* Returns false if the objects are the same, and throws * Returns false if the objects are the same, else true.
* an exception with the message 'different' if not.
*
* @param {object} item1
* @param {object} item2
* @returns {boolean}
* @throws {Error}
*/ */
function assertAddressMatch(item1, item2) { function isAddressDifferent(item1, item2){
// if neither record has address, assume same let address1 = _.get(item1, 'address_parts');
if (!item1.hasOwnProperty('address_parts') && !item2.hasOwnProperty('address_parts')) { let address2 = _.get(item2, 'address_parts');
return false;
}
// if both have address, check parts // if neither object has address info, we consider them the same
if (item1.hasOwnProperty('address_parts') && item2.hasOwnProperty('address_parts')) { if( !address1 && !address2 ){ return false; }
propMatch(item1.address_parts, item2.address_parts, 'number');
propMatch(item1.address_parts, item2.address_parts, 'street'); // if both have address info
if( _.isPlainObject(address1) && _.isPlainObject(address2) ){
if( isPropertyDifferent(address1, address2, 'number') ){ return true; }
if( isPropertyDifferent(address1, address2, 'street') ){ return true; }
// only compare zip if both records have it, otherwise just ignore and assume it's the same // only compare zip if both records have it, otherwise just ignore and assume it's the same
// since by this time we've already compared parent hierarchies // since by this time we've already compared parent hierarchies
if (item1.address_parts.hasOwnProperty('zip') && item2.address_parts.hasOwnProperty('zip')) { if( _.has(address1, 'zip') && _.has(address2, 'zip') ){
propMatch(item1.address_parts, item2.address_parts, 'zip'); if( isPropertyDifferent(address1, address2, 'zip') ){ return true; }
} }
return false; return false;
} }
// one has address and the other doesn't, different! // one has address and the other doesn't, different!
throw new Error('different'); return true;
} }
/** /**
* Compare the two records and return true if they differ and false if same. * Compare the two records and return true if they differ and false if same.
*
* @param {object} item1
* @param {object} item2
* @returns {boolean}
* @throws {Error}
*/ */
function isDifferent(item1, item2){ function isDifferent(item1, item2){
try { if( isLayerDifferent( item1, item2 ) ){ return true; }
assertLayerMatch(item1, item2); if( isParentHierarchyDifferent( item1, item2 ) ){ return true; }
assertParentHierarchyMatch(item1, item2); if( isNameDifferent( item1, item2 ) ){ return true; }
assertNameMatch(item1, item2); if( isAddressDifferent( item1, item2 ) ){ return true; }
assertAddressMatch(item1, item2);
}
catch (err) {
if (err.message === 'different') {
return true;
}
throw err;
}
return false; return false;
} }
/** /**
* Throw exception if properties are different * return true if properties are different
*
* @param {object} item1
* @param {object} item2
* @param {string} prop
* @throws {Error}
*/ */
function propMatch(item1, item2, prop) { function isPropertyDifferent(item1, item2, prop ){
var prop1 = item1[prop];
var prop2 = item2[prop];
// in the case the property is an array (currently only in parent schema) // if neither item has prop, we consider them the same
// simply take the 1st item. this will change in the near future to support multiple hierarchies if( !_.has(item1, prop) && !_.has(item2, prop) ){ return false; }
if (_.isArray(prop1)) { prop1 = prop1[0]; }
if (_.isArray(prop2)) { prop2 = prop2[0]; }
if (normalizeString(prop1) !== normalizeString(prop2)) { // handle arrays and other non-string values
throw new Error('different'); var prop1 = field.getStringValue( _.get( item1, prop ) );
} var prop2 = field.getStringValue( _.get( item2, prop ) );
// compare strings
return normalizeString(prop1) !== normalizeString(prop2);
} }
/** /**
* Remove punctuation and lowercase * lowercase characters and remove some punctuation
*
* @param {string} str
* @returns {string}
*/ */
function normalizeString(str){ function normalizeString(str){
if (!_.isString(str)) {
return str;
}
if (_.isEmpty(str)) {
return '';
}
return str.toLowerCase().split(/[ ,-]+/).join(' '); return str.toLowerCase().split(/[ ,-]+/).join(' ');
} }

107
middleware/dedupe.js

@ -3,87 +3,92 @@ const _ = require('lodash');
const isDifferent = require('../helper/diffPlaces').isDifferent; const isDifferent = require('../helper/diffPlaces').isDifferent;
const field = require('../helper/fieldValue'); const field = require('../helper/fieldValue');
function setup() {
return dedupeResults;
}
function dedupeResults(req, res, next) { function dedupeResults(req, res, next) {
// do nothing if no result data set
if (_.isUndefined(req.clean) || _.isUndefined(res) || _.isUndefined(res.data)) {
return next();
}
// loop through data items and only copy unique items to uniqueResults // do nothing if request data is invalid
var uniqueResults = []; if( _.isUndefined(res) || !_.isPlainObject(req.clean) ){ return next(); }
_.some(res.data, function (hit) { // do nothing if no result data is invalid
if( _.isUndefined(res) || !_.isArray(res.data) || _.isEmpty(res.data) ){ return next(); }
// loop through data items and only copy unique items to unique
// note: the first reqults must always be unique!
let unique = [ res.data[0] ];
// convenience function to search unique array for an existing element which matches a hit
let findMatch = (hit) => unique.findIndex(elem => !isDifferent(elem, hit));
// iterate over res.data using an old-school for loop starting at index 1
// we can call break at any time to end the iterator
for( let i=1; i<res.data.length; i++){
let hit = res.data[i];
if (_.isEmpty(uniqueResults)) {
uniqueResults.push(hit);
}
else {
// if there are multiple items in results, loop through them to find a dupe // if there are multiple items in results, loop through them to find a dupe
// save off the index of the dupe if found // save off the index of the dupe if found
var dupeIndex = uniqueResults.findIndex(function (elem, index, array) { let dupeIndex = findMatch(hit);
return !isDifferent(elem, hit);
}); // if a dupe is not found, just add to list of unique hits and continue
if( dupeIndex === -1 ){ unique.push(hit); }
// if a dupe is not found, just add to results and move on
if (dupeIndex === -1) {
uniqueResults.push(hit);
}
// if dupe was found, we need to check which of the records is preferred // if dupe was found, we need to check which of the records is preferred
// since the order in which Elasticsearch returns identical text matches is arbitrary // since the order in which Elasticsearch returns identical text matches is arbitrary
// of course, if the new one is preferred we should replace previous with new // of course, if the new one is preferred we should replace previous with new
else if (isPreferred(uniqueResults[dupeIndex], hit)) { else if( isPreferred( unique[dupeIndex], hit ) ) {
// replace previous dupe item with current hit
unique[dupeIndex] = hit;
// logging
logger.debug('[dupe][replacing]', { logger.debug('[dupe][replacing]', {
query: req.clean.text, query: req.clean.text,
previous: uniqueResults[dupeIndex].source, previous: unique[dupeIndex].source,
hit: field.getStringValue(hit.name.default) + ' ' + hit.source + ':' + hit._id hit: field.getStringValue(hit.name.default) + ' ' + hit.source + ':' + hit._id
}); });
// replace previous dupe item with current hit
uniqueResults[dupeIndex] = hit;
} }
// if not preferred over existing, just log and move on
// if not preferred over existing, just log and continue
else { else {
logger.debug('[dupe][skipping]', { logger.debug('[dupe][skipping]', {
query: req.clean.text, query: req.clean.text,
previous: uniqueResults[dupeIndex].source, previous: unique[dupeIndex].source,
hit: field.getStringValue(hit.name.default) + ' ' + hit.source + ':' + hit._id hit: field.getStringValue(hit.name.default) + ' ' + hit.source + ':' + hit._id
}); });
} }
}
// stop looping when requested size has been reached in uniqueResults // stop iterating when requested size has been reached in unique
return req.clean.size <= uniqueResults.length; if( unique.length >= req.clean.size ){ break; }
}); }
res.data = uniqueResults; // replace the original data with only the unique hits
res.data = unique;
next(); next();
} }
function isPreferred(existing, candidateReplacement) { // return true if the second argument represents a hit which is preferred
// NOTE: we are assuming here that the layer for both records is the same // to the hit in the first argument
const hasZip = _.bind(_.has, null, _.bind.placeholder, 'address_parts.zip'); function isPreferred(existingHit, candidateHit) {
// prefer a record with a postcode
// https://github.com/pelias/api/issues/872 // https://github.com/pelias/api/issues/872
const candidateHasZip = hasZip(candidateReplacement); if( !_.has(existingHit, 'address_parts.zip') &&
const existingHasZip = hasZip(existing); _.has(candidateHit, 'address_parts.zip') ){ return true; }
if (candidateHasZip !== existingHasZip) {
return candidateHasZip;
}
//bind the trumps function to the data items to keep the rest of the function clean // prefer certain sources over others
var trumpsFunc = trumps.bind(null, existing, candidateReplacement); switch( existingHit.source ){
// sources are the same
return trumpsFunc('geonames', 'whosonfirst') || // WOF has bbox and is generally preferred case candidateHit.source: return false;
trumpsFunc('openstreetmap', 'openaddresses') || // addresses are better in OA // WOF has bbox and is generally preferred
trumpsFunc('whosonfirst', 'openstreetmap'); // venues are better in OSM, at this time case 'geonames': return candidateHit.source === 'whosonfirst';
// addresses are generally better in OA
case 'openstreetmap': return candidateHit.source === 'openaddresses';
// venues are better in OSM
case 'whosonfirst': return candidateHit.source === 'openstreetmap';
// no preference, keep existing hit
default: return false;
} }
function trumps(existing, candidateReplacement, loserSource, winnerSource) {
return existing.source === loserSource && candidateReplacement.source === winnerSource;
} }
module.exports = setup; module.exports = function() {
return dedupeResults;
};

Loading…
Cancel
Save