Browse Source

refactor middleware dedupe for readability

dedupe-failing-test-case
Peter Johnson 6 years ago committed by Julian Simioni
parent
commit
c458ea74d8
No known key found for this signature in database
GPG Key ID: B9EEB0C6EE0910A1
  1. 206
      helper/diffPlaces.js
  2. 135
      middleware/dedupe.js

206
helper/diffPlaces.js

@ -1,176 +1,136 @@
var _ = require('lodash');
var placeTypes = require('./placeTypes');
const _ = require('lodash');
const placeTypes = require('./placeTypes');
const field = require('../helper/fieldValue');
/**
* Compare the layer properties if they exist.
* Returns false if the objects are the same, and throws
* an exception with the message 'different' if not.
*
* @param {object} item1
* @param {object} item2
* @returns {boolean}
* @throws {Error}
* Returns false if the objects are the same, else true.
*/
function assertLayerMatch(item1, item2) {
if (item1.layer === item2.layer) {
return false;
}
throw new Error('different');
function isLayerDifferent(item1, item2){
return isPropertyDifferent(item1, item2, 'layer');
}
/**
* Compare the parent.*_id properties if they exist.
* Returns false if the objects are the same, and throws
* an exception with the message 'different' if not.
*
* @param {object} item1
* @param {object} item2
* @returns {boolean}
* @throws {Error}
* Compare the parent properties if they exist.
* Returns false if the objects are the same, else true.
*/
function assertParentHierarchyMatch(item1, item2) {
// if neither object has parent, assume same
if (!item1.hasOwnProperty('parent') && !item2.hasOwnProperty('parent')) {
return false;
}
function isParentHierarchyDifferent(item1, item2){
let parent1 = _.get(item1, 'parent');
let parent2 = _.get(item2, 'parent');
// if both have parent, do the rest of the checking
if (item1.hasOwnProperty('parent') && item2.hasOwnProperty('parent')) {
placeTypes.forEach(function (placeType) {
// don't consider its own id
if (placeType === item1.layer) {
return;
}
propMatch(item1.parent, item2.parent, placeType + '_id');
// if neither object has parent info, we consider them the same
if( !parent1 && !parent2 ){ return false; }
// both have parent info
if( _.isPlainObject(parent1) && _.isPlainObject(parent2) ){
// iterate over all the placetypes, comparing between items
return placeTypes.some( placeType => {
// skip the parent field corresponding to the item placetype
if( placeType === item1.layer ){ return false; }
// ensure the parent ids are the same for all placetypes
return isPropertyDifferent( item1.parent, item2.parent, placeType + '_id' );
});
return false;
}
// if one has parent and the other doesn't consider different
throw new Error('different');
// if one has parent info and the other doesn't, we consider them different
return true;
}
/**
* Compare the name.* properties if they exist.
* Returns false if the objects are the same, and throws
* an exception with the message 'different' if not.
*
* @param {object} item1
* @param {object} item2
* @returns {boolean}
* @throws {Error}
* Compare the name properties if they exist.
* Returns false if the objects are the same, else true.
*/
function assertNameMatch(item1, item2) {
if (item1.hasOwnProperty('name') && item2.hasOwnProperty('name')) {
for (var lang in item1.name) {
if(item2.name.hasOwnProperty(lang) || lang === 'default') {
function isNameDifferent(item1, item2){
let names1 = _.get(item1, 'name');
let names2 = _.get(item2, 'name');
// if neither object has name info, we consider them the same
if( !names1 && !names2 ){ return false; }
// if both have name info
if( _.isPlainObject(names1) && _.isPlainObject(names2) ){
// iterate over all the languages in item1, comparing between items
return Object.keys(names1).some( lang => {
// do not consider absence of an additional name as a difference
// but strictly enfore that 'default' must be present and match
if( _.has(names2, lang) || lang === 'default' ){
// do not consider absence of an additional name as a difference
propMatch(item1.name, item2.name, lang);
return isPropertyDifferent(names1, names2, lang);
}
}
}
else {
propMatch(item1, item2, 'name');
});
}
// if one has name info and the other doesn't, we consider them different
return true;
}
/**
* Compare the address_parts properties if they exist.
* Returns false if the objects are the same, and throws
* an exception with the message 'different' if not.
*
* @param {object} item1
* @param {object} item2
* @returns {boolean}
* @throws {Error}
* Returns false if the objects are the same, else true.
*/
function assertAddressMatch(item1, item2) {
// if neither record has address, assume same
if (!item1.hasOwnProperty('address_parts') && !item2.hasOwnProperty('address_parts')) {
return false;
}
function isAddressDifferent(item1, item2){
let address1 = _.get(item1, 'address_parts');
let address2 = _.get(item2, 'address_parts');
// if both have address, check parts
if (item1.hasOwnProperty('address_parts') && item2.hasOwnProperty('address_parts')) {
propMatch(item1.address_parts, item2.address_parts, 'number');
propMatch(item1.address_parts, item2.address_parts, 'street');
// if neither object has address info, we consider them the same
if( !address1 && !address2 ){ return false; }
// if both have address info
if( _.isPlainObject(address1) && _.isPlainObject(address2) ){
if( isPropertyDifferent(address1, address2, 'number') ){ return true; }
if( isPropertyDifferent(address1, address2, 'street') ){ return true; }
// only compare zip if both records have it, otherwise just ignore and assume it's the same
// since by this time we've already compared parent hierarchies
if (item1.address_parts.hasOwnProperty('zip') && item2.address_parts.hasOwnProperty('zip')) {
propMatch(item1.address_parts, item2.address_parts, 'zip');
if( _.has(address1, 'zip') && _.has(address2, 'zip') ){
if( isPropertyDifferent(address1, address2, 'zip') ){ return true; }
}
return false;
}
// one has address and the other doesn't, different!
throw new Error('different');
return true;
}
/**
* Compare the two records and return true if they differ and false if same.
*
* @param {object} item1
* @param {object} item2
* @returns {boolean}
* @throws {Error}
*/
function isDifferent(item1, item2) {
try {
assertLayerMatch(item1, item2);
assertParentHierarchyMatch(item1, item2);
assertNameMatch(item1, item2);
assertAddressMatch(item1, item2);
}
catch (err) {
if (err.message === 'different') {
return true;
}
throw err;
}
function isDifferent(item1, item2){
if( isLayerDifferent( item1, item2 ) ){ return true; }
if( isParentHierarchyDifferent( item1, item2 ) ){ return true; }
if( isNameDifferent( item1, item2 ) ){ return true; }
if( isAddressDifferent( item1, item2 ) ){ return true; }
return false;
}
/**
* Throw exception if properties are different
*
* @param {object} item1
* @param {object} item2
* @param {string} prop
* @throws {Error}
* return true if properties are different
*/
function propMatch(item1, item2, prop) {
var prop1 = item1[prop];
var prop2 = item2[prop];
function isPropertyDifferent(item1, item2, prop ){
// in the case the property is an array (currently only in parent schema)
// simply take the 1st item. this will change in the near future to support multiple hierarchies
if (_.isArray(prop1)) { prop1 = prop1[0]; }
if (_.isArray(prop2)) { prop2 = prop2[0]; }
// if neither item has prop, we consider them the same
if( !_.has(item1, prop) && !_.has(item2, prop) ){ return false; }
if (normalizeString(prop1) !== normalizeString(prop2)) {
throw new Error('different');
}
// handle arrays and other non-string values
var prop1 = field.getStringValue( _.get( item1, prop ) );
var prop2 = field.getStringValue( _.get( item2, prop ) );
// compare strings
return normalizeString(prop1) !== normalizeString(prop2);
}
/**
* Remove punctuation and lowercase
*
* @param {string} str
* @returns {string}
* lowercase characters and remove some punctuation
*/
function normalizeString(str) {
if (!_.isString(str)) {
return str;
}
if (_.isEmpty(str)) {
return '';
}
function normalizeString(str){
return str.toLowerCase().split(/[ ,-]+/).join(' ');
}

135
middleware/dedupe.js

@ -3,87 +3,92 @@ const _ = require('lodash');
const isDifferent = require('../helper/diffPlaces').isDifferent;
const field = require('../helper/fieldValue');
function setup() {
return dedupeResults;
}
function dedupeResults(req, res, next) {
// do nothing if no result data set
if (_.isUndefined(req.clean) || _.isUndefined(res) || _.isUndefined(res.data)) {
return next();
}
// loop through data items and only copy unique items to uniqueResults
var uniqueResults = [];
// do nothing if request data is invalid
if( _.isUndefined(res) || !_.isPlainObject(req.clean) ){ return next(); }
// do nothing if no result data is invalid
if( _.isUndefined(res) || !_.isArray(res.data) || _.isEmpty(res.data) ){ return next(); }
// loop through data items and only copy unique items to unique
// note: the first reqults must always be unique!
let unique = [ res.data[0] ];
// convenience function to search unique array for an existing element which matches a hit
let findMatch = (hit) => unique.findIndex(elem => !isDifferent(elem, hit));
// iterate over res.data using an old-school for loop starting at index 1
// we can call break at any time to end the iterator
for( let i=1; i<res.data.length; i++){
let hit = res.data[i];
// if there are multiple items in results, loop through them to find a dupe
// save off the index of the dupe if found
let dupeIndex = findMatch(hit);
// if a dupe is not found, just add to list of unique hits and continue
if( dupeIndex === -1 ){ unique.push(hit); }
// if dupe was found, we need to check which of the records is preferred
// since the order in which Elasticsearch returns identical text matches is arbitrary
// of course, if the new one is preferred we should replace previous with new
else if( isPreferred( unique[dupeIndex], hit ) ) {
_.some(res.data, function (hit) {
// replace previous dupe item with current hit
unique[dupeIndex] = hit;
if (_.isEmpty(uniqueResults)) {
uniqueResults.push(hit);
// logging
logger.debug('[dupe][replacing]', {
query: req.clean.text,
previous: unique[dupeIndex].source,
hit: field.getStringValue(hit.name.default) + ' ' + hit.source + ':' + hit._id
});
}
// if not preferred over existing, just log and continue
else {
// if there are multiple items in results, loop through them to find a dupe
// save off the index of the dupe if found
var dupeIndex = uniqueResults.findIndex(function (elem, index, array) {
return !isDifferent(elem, hit);
logger.debug('[dupe][skipping]', {
query: req.clean.text,
previous: unique[dupeIndex].source,
hit: field.getStringValue(hit.name.default) + ' ' + hit.source + ':' + hit._id
});
// if a dupe is not found, just add to results and move on
if (dupeIndex === -1) {
uniqueResults.push(hit);
}
// if dupe was found, we need to check which of the records is preferred
// since the order in which Elasticsearch returns identical text matches is arbitrary
// of course, if the new one is preferred we should replace previous with new
else if (isPreferred(uniqueResults[dupeIndex], hit)) {
logger.debug('[dupe][replacing]', {
query: req.clean.text,
previous: uniqueResults[dupeIndex].source,
hit: field.getStringValue(hit.name.default) + ' ' + hit.source + ':' + hit._id
});
// replace previous dupe item with current hit
uniqueResults[dupeIndex] = hit;
}
// if not preferred over existing, just log and move on
else {
logger.debug('[dupe][skipping]', {
query: req.clean.text,
previous: uniqueResults[dupeIndex].source,
hit: field.getStringValue(hit.name.default) + ' ' + hit.source + ':' + hit._id
});
}
}
// stop looping when requested size has been reached in uniqueResults
return req.clean.size <= uniqueResults.length;
});
// stop iterating when requested size has been reached in unique
if( unique.length >= req.clean.size ){ break; }
}
res.data = uniqueResults;
// replace the original data with only the unique hits
res.data = unique;
next();
}
function isPreferred(existing, candidateReplacement) {
// NOTE: we are assuming here that the layer for both records is the same
const hasZip = _.bind(_.has, null, _.bind.placeholder, 'address_parts.zip');
// return true if the second argument represents a hit which is preferred
// to the hit in the first argument
function isPreferred(existingHit, candidateHit) {
// prefer a record with a postcode
// https://github.com/pelias/api/issues/872
const candidateHasZip = hasZip(candidateReplacement);
const existingHasZip = hasZip(existing);
if (candidateHasZip !== existingHasZip) {
return candidateHasZip;
}
if( !_.has(existingHit, 'address_parts.zip') &&
_.has(candidateHit, 'address_parts.zip') ){ return true; }
//bind the trumps function to the data items to keep the rest of the function clean
var trumpsFunc = trumps.bind(null, existing, candidateReplacement);
return trumpsFunc('geonames', 'whosonfirst') || // WOF has bbox and is generally preferred
trumpsFunc('openstreetmap', 'openaddresses') || // addresses are better in OA
trumpsFunc('whosonfirst', 'openstreetmap'); // venues are better in OSM, at this time
}
function trumps(existing, candidateReplacement, loserSource, winnerSource) {
return existing.source === loserSource && candidateReplacement.source === winnerSource;
// prefer certain sources over others
switch( existingHit.source ){
// sources are the same
case candidateHit.source: return false;
// WOF has bbox and is generally preferred
case 'geonames': return candidateHit.source === 'whosonfirst';
// addresses are generally better in OA
case 'openstreetmap': return candidateHit.source === 'openaddresses';
// venues are better in OSM
case 'whosonfirst': return candidateHit.source === 'openstreetmap';
// no preference, keep existing hit
default: return false;
}
}
module.exports = setup;
module.exports = function() {
return dedupeResults;
};

Loading…
Cancel
Save