Browse Source

Merge pull request #1222 from pelias/dedupe

refactor dedupe middleware
master v3.37.0
Julian Simioni 6 years ago committed by GitHub
parent
commit
3535f9eae1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 22
      helper/TypeMapping.js
  2. 236
      helper/diffPlaces.js
  3. 138
      middleware/dedupe.js
  4. 2
      package.json
  5. 32
      test/unit/fixture/dedupe_elasticsearch_custom_layer_results.js
  6. 27
      test/unit/fixture/dedupe_only_postalcode_differs.js
  7. 53
      test/unit/middleware/dedupe.js

22
helper/TypeMapping.js

@ -26,6 +26,11 @@ var TypeMapping = function(){
*/ */
this.layer_aliases = {}; this.layer_aliases = {};
/*
* A list of the canonical sources included in the default Pelias configuration
*/
this.canonical_sources = [];
/* /*
* An object that contains all sources or aliases. The key is the source or alias, * An object that contains all sources or aliases. The key is the source or alias,
* the value is either that source, or the canonical name for that alias if it's an alias. * the value is either that source, or the canonical name for that alias if it's an alias.
@ -65,6 +70,11 @@ TypeMapping.prototype.setLayerAliases = function( aliases ){
this.layer_aliases = aliases; this.layer_aliases = aliases;
}; };
// canonical sources setter
TypeMapping.prototype.setCanonicalSources = function( sources ){
this.canonical_sources = sources;
};
// generate mappings after setters have been run // generate mappings after setters have been run
TypeMapping.prototype.generateMappings = function(){ TypeMapping.prototype.generateMappings = function(){
this.sources = Object.keys( this.layers_by_source ); this.sources = Object.keys( this.layers_by_source );
@ -75,6 +85,17 @@ TypeMapping.prototype.generateMappings = function(){
this.layer_mapping = TypeMapping.addStandardTargetsToAliases(this.layers, this.layer_aliases); this.layer_mapping = TypeMapping.addStandardTargetsToAliases(this.layers, this.layer_aliases);
}; };
// generate a list of all layers which are part of the canonical Pelias configuration
TypeMapping.prototype.getCanonicalLayers = function(){
var canonicalLayers = [];
for( var source in this.layers_by_source ){
if( _.includes( this.canonical_sources, source ) ){
canonicalLayers = _.uniq( canonicalLayers.concat( this.layers_by_source[source] ) );
}
}
return canonicalLayers;
};
// load values from targets block // load values from targets block
TypeMapping.prototype.loadTargets = function( targetsBlock ){ TypeMapping.prototype.loadTargets = function( targetsBlock ){
@ -84,6 +105,7 @@ TypeMapping.prototype.loadTargets = function( targetsBlock ){
this.setSourceAliases( targetsBlock.source_aliases || {} ); this.setSourceAliases( targetsBlock.source_aliases || {} );
this.setLayersBySource( targetsBlock.layers_by_source || {} ); this.setLayersBySource( targetsBlock.layers_by_source || {} );
this.setLayerAliases( targetsBlock.layer_aliases || {} ); this.setLayerAliases( targetsBlock.layer_aliases || {} );
this.setCanonicalSources( targetsBlock.canonical_sources || [] );
// generate the mappings // generate the mappings
this.generateMappings(); this.generateMappings();

236
helper/diffPlaces.js

@ -1,176 +1,150 @@
var _ = require('lodash'); const _ = require('lodash');
var placeTypes = require('./placeTypes'); const placeTypes = require('./placeTypes');
const canonicalLayers = require('../helper/type_mapping').getCanonicalLayers();
const field = require('../helper/fieldValue');
/** /**
* Compare the layer properties if they exist. * Compare the layer properties if they exist.
* Returns false if the objects are the same, and throws * Returns false if the objects are the same, else true.
* an exception with the message 'different' if not.
*
* @param {object} item1
* @param {object} item2
* @returns {boolean}
* @throws {Error}
*/ */
function assertLayerMatch(item1, item2) { function isLayerDifferent(item1, item2){
if (item1.layer === item2.layer) { if( isPropertyDifferent(item1, item2, 'layer') ){
return false; // consider all custom layers to be analogous to a venue
if( ( item1.layer === 'venue' || !_.includes( canonicalLayers, item1.layer ) ) &&
( item2.layer === 'venue' || !_.includes( canonicalLayers, item2.layer ) ) ){
return false;
}
return true;
} }
return false;
throw new Error('different');
} }
/** /**
* Compare the parent.*_id properties if they exist. * Compare the parent properties if they exist.
* Returns false if the objects are the same, and throws * Returns false if the objects are the same, else true.
* an exception with the message 'different' if not.
*
* @param {object} item1
* @param {object} item2
* @returns {boolean}
* @throws {Error}
*/ */
function assertParentHierarchyMatch(item1, item2) { function isParentHierarchyDifferent(item1, item2){
// if neither object has parent, assume same let parent1 = _.get(item1, 'parent');
if (!item1.hasOwnProperty('parent') && !item2.hasOwnProperty('parent')) { let parent2 = _.get(item2, 'parent');
return false;
}
// if both have parent, do the rest of the checking // check if these are plain 'ol javascript objects
if (item1.hasOwnProperty('parent') && item2.hasOwnProperty('parent')) { let isPojo1 = _.isPlainObject(parent1);
placeTypes.forEach(function (placeType) { let isPojo2 = _.isPlainObject(parent2);
// don't consider its own id
if (placeType === item1.layer) { // if neither object has parent info, we consider them the same
return; if( !isPojo1 && !isPojo2 ){ return false; }
}
propMatch(item1.parent, item2.parent, placeType + '_id'); // if only one has parent info, we consider them the same
}); // note: this really shouldn't happen as at least on parent should exist
return false; if( !isPojo1 || !isPojo2 ){ return false; }
}
// else both have parent info
// iterate over all the placetypes, comparing between items
return placeTypes.some( placeType => {
// if one has parent and the other doesn't consider different // skip the parent field corresponding to the item placetype
throw new Error('different'); if( placeType === item1.layer ){ return false; }
// ensure the parent ids are the same for all placetypes
return isPropertyDifferent( item1.parent, item2.parent, placeType + '_id' );
});
} }
/** /**
* Compare the name.* properties if they exist. * Compare the name properties if they exist.
* Returns false if the objects are the same, and throws * Returns false if the objects are the same, else true.
* an exception with the message 'different' if not.
*
* @param {object} item1
* @param {object} item2
* @returns {boolean}
* @throws {Error}
*/ */
function assertNameMatch(item1, item2) { function isNameDifferent(item1, item2){
if (item1.hasOwnProperty('name') && item2.hasOwnProperty('name')) { let names1 = _.get(item1, 'name');
for (var lang in item1.name) { let names2 = _.get(item2, 'name');
if(item2.name.hasOwnProperty(lang) || lang === 'default') {
// do not consider absence of an additional name as a difference // check if these are plain 'ol javascript objects
propMatch(item1.name, item2.name, lang); let isPojo1 = _.isPlainObject(names1);
} let isPojo2 = _.isPlainObject(names2);
// if neither object has name info, we consider them the same
if( !isPojo1 && !isPojo2 ){ return false; }
// if only one has name info, we consider them the same
// note: this really shouldn't happen as name is a mandatory field
if( !isPojo1 || !isPojo2 ){ return false; }
// else both have name info
// iterate over all the languages in item1, comparing between items
return Object.keys(names1).some( lang => {
// do not consider absence of an additional name as a difference
// but strictly enfore that 'default' must be present and match
if( _.has(names2, lang) || lang === 'default' ){
// do not consider absence of an additional name as a difference
return isPropertyDifferent(names1, names2, lang);
} }
} });
else {
propMatch(item1, item2, 'name');
}
} }
/** /**
* Compare the address_parts properties if they exist. * Compare the address_parts properties if they exist.
* Returns false if the objects are the same, and throws * Returns false if the objects are the same, else true.
* an exception with the message 'different' if not.
*
* @param {object} item1
* @param {object} item2
* @returns {boolean}
* @throws {Error}
*/ */
function assertAddressMatch(item1, item2) { function isAddressDifferent(item1, item2){
// if neither record has address, assume same let address1 = _.get(item1, 'address_parts');
if (!item1.hasOwnProperty('address_parts') && !item2.hasOwnProperty('address_parts')) { let address2 = _.get(item2, 'address_parts');
return false;
}
// if both have address, check parts // check if these are plain 'ol javascript objects
if (item1.hasOwnProperty('address_parts') && item2.hasOwnProperty('address_parts')) { let isPojo1 = _.isPlainObject(address1);
propMatch(item1.address_parts, item2.address_parts, 'number'); let isPojo2 = _.isPlainObject(address2);
propMatch(item1.address_parts, item2.address_parts, 'street');
// only compare zip if both records have it, otherwise just ignore and assume it's the same // if neither object has address info, we consider them the same
// since by this time we've already compared parent hierarchies if( !isPojo1 && !isPojo2 ){ return false; }
if (item1.address_parts.hasOwnProperty('zip') && item2.address_parts.hasOwnProperty('zip')) {
propMatch(item1.address_parts, item2.address_parts, 'zip'); // if only one has address info, we consider them the same
} if( !isPojo1 || !isPojo2 ){ return false; }
return false; // else both have address info
if( isPropertyDifferent(address1, address2, 'number') ){ return true; }
if( isPropertyDifferent(address1, address2, 'street') ){ return true; }
// only compare zip if both records have it, otherwise just ignore and assume it's the same
// since by this time we've already compared parent hierarchies
if( _.has(address1, 'zip') && _.has(address2, 'zip') ){
if( isPropertyDifferent(address1, address2, 'zip') ){ return true; }
} }
// one has address and the other doesn't, different! return false;
throw new Error('different');
} }
/** /**
* Compare the two records and return true if they differ and false if same. * Compare the two records and return true if they differ and false if same.
*
* @param {object} item1
* @param {object} item2
* @returns {boolean}
* @throws {Error}
*/ */
function isDifferent(item1, item2) { function isDifferent(item1, item2){
try { if( isLayerDifferent( item1, item2 ) ){ return true; }
assertLayerMatch(item1, item2); if( isParentHierarchyDifferent( item1, item2 ) ){ return true; }
assertParentHierarchyMatch(item1, item2); if( isNameDifferent( item1, item2 ) ){ return true; }
assertNameMatch(item1, item2); if( isAddressDifferent( item1, item2 ) ){ return true; }
assertAddressMatch(item1, item2);
}
catch (err) {
if (err.message === 'different') {
return true;
}
throw err;
}
return false; return false;
} }
/** /**
* Throw exception if properties are different * return true if properties are different
*
* @param {object} item1
* @param {object} item2
* @param {string} prop
* @throws {Error}
*/ */
function propMatch(item1, item2, prop) { function isPropertyDifferent(item1, item2, prop ){
var prop1 = item1[prop];
var prop2 = item2[prop];
// in the case the property is an array (currently only in parent schema) // if neither item has prop, we consider them the same
// simply take the 1st item. this will change in the near future to support multiple hierarchies if( !_.has(item1, prop) && !_.has(item2, prop) ){ return false; }
if (_.isArray(prop1)) { prop1 = prop1[0]; }
if (_.isArray(prop2)) { prop2 = prop2[0]; }
if (normalizeString(prop1) !== normalizeString(prop2)) { // handle arrays and other non-string values
throw new Error('different'); var prop1 = field.getStringValue( _.get( item1, prop ) );
} var prop2 = field.getStringValue( _.get( item2, prop ) );
// compare strings
return normalizeString(prop1) !== normalizeString(prop2);
} }
/** /**
* Remove punctuation and lowercase * lowercase characters and remove some punctuation
*
* @param {string} str
* @returns {string}
*/ */
function normalizeString(str) { function normalizeString(str){
if (!_.isString(str)) {
return str;
}
if (_.isEmpty(str)) {
return '';
}
return str.toLowerCase().split(/[ ,-]+/).join(' '); return str.toLowerCase().split(/[ ,-]+/).join(' ');
} }

138
middleware/dedupe.js

@ -1,89 +1,99 @@
const logger = require('pelias-logger').get('api'); const logger = require('pelias-logger').get('api');
const _ = require('lodash'); const _ = require('lodash');
const isDifferent = require('../helper/diffPlaces').isDifferent; const isDifferent = require('../helper/diffPlaces').isDifferent;
const canonical_sources = require('../helper/type_mapping').canonical_sources;
const field = require('../helper/fieldValue'); const field = require('../helper/fieldValue');
function setup() {
return dedupeResults;
}
function dedupeResults(req, res, next) { function dedupeResults(req, res, next) {
// do nothing if no result data set
if (_.isUndefined(req.clean) || _.isUndefined(res) || _.isUndefined(res.data)) {
return next();
}
// loop through data items and only copy unique items to uniqueResults // do nothing if request data is invalid
var uniqueResults = []; if( _.isUndefined(res) || !_.isPlainObject(req.clean) ){ return next(); }
// do nothing if no result data is invalid
if( _.isUndefined(res) || !_.isArray(res.data) || _.isEmpty(res.data) ){ return next(); }
// loop through data items and only copy unique items to unique
// note: the first results must always be unique!
let unique = [ res.data[0] ];
// convenience function to search unique array for an existing element which matches a hit
let findMatch = (hit) => unique.findIndex(elem => !isDifferent(elem, hit));
// iterate over res.data using an old-school for loop starting at index 1
// we can call break at any time to end the iterator
for( let i=1; i<res.data.length; i++){
let hit = res.data[i];
// if there are multiple items in results, loop through them to find a dupe
// save off the index of the dupe if found
let dupeIndex = findMatch(hit);
// if a dupe is not found, just add to list of unique hits and continue
if( dupeIndex === -1 ){ unique.push(hit); }
_.some(res.data, function (hit) { // if dupe was found, we need to check which of the records is preferred
// since the order in which Elasticsearch returns identical text matches is arbitrary
// of course, if the new one is preferred we should replace previous with new
else if( isPreferred( unique[dupeIndex], hit ) ) {
if (_.isEmpty(uniqueResults)) { // replace previous dupe item with current hit
uniqueResults.push(hit); unique[dupeIndex] = hit;
// logging
logger.debug('[dupe][replacing]', {
query: req.clean.text,
previous: unique[dupeIndex].source,
hit: field.getStringValue(hit.name.default) + ' ' + hit.source + ':' + hit._id
});
} }
// if not preferred over existing, just log and continue
else { else {
// if there are multiple items in results, loop through them to find a dupe logger.debug('[dupe][skipping]', {
// save off the index of the dupe if found query: req.clean.text,
var dupeIndex = uniqueResults.findIndex(function (elem, index, array) { previous: unique[dupeIndex].source,
return !isDifferent(elem, hit); hit: field.getStringValue(hit.name.default) + ' ' + hit.source + ':' + hit._id
}); });
// if a dupe is not found, just add to results and move on
if (dupeIndex === -1) {
uniqueResults.push(hit);
}
// if dupe was found, we need to check which of the records is preferred
// since the order in which Elasticsearch returns identical text matches is arbitrary
// of course, if the new one is preferred we should replace previous with new
else if (isPreferred(uniqueResults[dupeIndex], hit)) {
logger.debug('[dupe][replacing]', {
query: req.clean.text,
previous: uniqueResults[dupeIndex].source,
hit: field.getStringValue(hit.name.default) + ' ' + hit.source + ':' + hit._id
});
// replace previous dupe item with current hit
uniqueResults[dupeIndex] = hit;
}
// if not preferred over existing, just log and move on
else {
logger.debug('[dupe][skipping]', {
query: req.clean.text,
previous: uniqueResults[dupeIndex].source,
hit: field.getStringValue(hit.name.default) + ' ' + hit.source + ':' + hit._id
});
}
} }
// stop looping when requested size has been reached in uniqueResults // stop iterating when requested size has been reached in unique
return req.clean.size <= uniqueResults.length; if( unique.length >= req.clean.size ){ break; }
}); }
res.data = uniqueResults; // replace the original data with only the unique hits
res.data = unique;
next(); next();
} }
function isPreferred(existing, candidateReplacement) { // return true if the second argument represents a hit which is preferred
// NOTE: we are assuming here that the layer for both records is the same // to the hit in the first argument
const hasZip = _.bind(_.has, null, _.bind.placeholder, 'address_parts.zip'); function isPreferred(existingHit, candidateHit) {
// prefer a record with a postcode
// https://github.com/pelias/api/issues/872 // https://github.com/pelias/api/issues/872
const candidateHasZip = hasZip(candidateReplacement); if( !_.has(existingHit, 'address_parts.zip') &&
const existingHasZip = hasZip(existing); _.has(candidateHit, 'address_parts.zip') ){ return true; }
if (candidateHasZip !== existingHasZip) {
return candidateHasZip;
}
//bind the trumps function to the data items to keep the rest of the function clean
var trumpsFunc = trumps.bind(null, existing, candidateReplacement);
return trumpsFunc('geonames', 'whosonfirst') || // WOF has bbox and is generally preferred // prefer non-canonical sources over canonical ones
trumpsFunc('openstreetmap', 'openaddresses') || // addresses are better in OA if( !_.includes(canonical_sources, candidateHit.source) &&
trumpsFunc('whosonfirst', 'openstreetmap'); // venues are better in OSM, at this time _.includes(canonical_sources, existingHit.source) ){ return true; }
}
function trumps(existing, candidateReplacement, loserSource, winnerSource) { // prefer certain sources over others
return existing.source === loserSource && candidateReplacement.source === winnerSource; switch( existingHit.source ){
// sources are the same
case candidateHit.source: return false;
// WOF has bbox and is generally preferred
case 'geonames': return candidateHit.source === 'whosonfirst';
// addresses are generally better in OA
case 'openstreetmap': return candidateHit.source === 'openaddresses';
// venues are better in OSM
case 'whosonfirst': return candidateHit.source === 'openstreetmap';
// no preference, keep existing hit
default: return false;
}
} }
module.exports = setup; module.exports = function() {
return dedupeResults;
};

2
package.json

@ -52,7 +52,7 @@
"markdown": "^0.5.0", "markdown": "^0.5.0",
"morgan": "^1.8.2", "morgan": "^1.8.2",
"pelias-categories": "^1.2.0", "pelias-categories": "^1.2.0",
"pelias-config": "^3.0.2", "pelias-config": "^3.7.0",
"pelias-labels": "^1.8.0", "pelias-labels": "^1.8.0",
"pelias-logger": "^1.2.0", "pelias-logger": "^1.2.0",
"pelias-microservice-wrapper": "^1.7.0", "pelias-microservice-wrapper": "^1.7.0",

32
test/unit/fixture/dedupe_elasticsearch_custom_layer_results.js

@ -0,0 +1,32 @@
module.exports = [
{
'_id': '101914069',
'layer': 'venue',
'source': 'openstreetmap',
'name': {
'default': 'Nike World Headquarters'
},
'parent': {
'country_a': ['USA'],
'country': ['United States'],
'region': ['Oregon'],
'region_id': ['85688513']
},
'confidence': 0.98
},
{
'_id': '2456::trimet::major_employer',
'layer': 'major_employer',
'source': 'transit',
'name': {
'default': 'Nike World Headquarters'
},
'parent': {
'country_a': ['USA'],
'country': ['United States'],
'region': ['Oregon'],
'region_id': ['85688513']
},
'confidence': 0.50
}
];

27
test/unit/fixture/dedupe_only_postalcode_differs.js

@ -0,0 +1,27 @@
module.exports = [
{
'_id': '101914069',
'layer': 'venue',
'source': 'openstreetmap',
'name': {
'default': 'A place'
},
'parent': {
'country_a': ['USA']
}
},
{
'_id': '323',
'layer': 'venue',
'source': 'openstreetmap',
'name': {
'default': 'A place'
},
'address_parts': {
'zip': '97005'
},
'parent': {
'country_a': ['USA']
}
}
];

53
test/unit/middleware/dedupe.js

@ -1,5 +1,7 @@
var data = require('../fixture/dedupe_elasticsearch_results'); var data = require('../fixture/dedupe_elasticsearch_results');
var nonAsciiData = require('../fixture/dedupe_elasticsearch_nonascii_results'); var nonAsciiData = require('../fixture/dedupe_elasticsearch_nonascii_results');
var customLayerData = require('../fixture/dedupe_elasticsearch_custom_layer_results');
var onlyPostalcodeDiffersData = require('../fixture/dedupe_only_postalcode_differs');
var dedupe = require('../../../middleware/dedupe')(); var dedupe = require('../../../middleware/dedupe')();
module.exports.tests = {}; module.exports.tests = {};
@ -56,10 +58,47 @@ module.exports.tests.dedupe = function(test, common) {
t.end(); t.end();
}); });
}); });
test('deduplicate between custom layers and venue layers', function(t) {
var req = {
clean: {
size: 20
}
};
var res = {
data: customLayerData
};
var expected = customLayerData[1]; // non-canonical record
dedupe(req, res, function () {
t.equal(res.data.length, 1, 'only one result displayed');
t.equal(res.data[0], expected, 'non-canonical data is preferred');
t.end();
});
});
test('test records with no address except one has postalcode', function(t) {
var req = {
clean: {
size: 20
}
};
var res = {
data: onlyPostalcodeDiffersData
};
var expected = onlyPostalcodeDiffersData[1]; // record with postcode
dedupe(req, res, function () {
t.equal(res.data.length, 1, 'only one result displayed');
t.equal(res.data[0], expected, 'record with postalcode is preferred');
t.end();
});
});
}; };
module.exports.tests.trump = function(test, common) {
test('whosonfirst trumps geonames, replace', function (t) { module.exports.tests.priority = function(test, common) {
test('whosonfirst takes priority over geonames, replace', function (t) {
var req = { var req = {
clean: { clean: {
text: 'Lancaster', text: 'Lancaster',
@ -91,7 +130,7 @@ module.exports.tests.trump = function(test, common) {
}); });
}); });
test('whosonfirst trumps geonames, no replace', function (t) { test('whosonfirst takes priority over geonames, no replace', function (t) {
var req = { var req = {
clean: { clean: {
text: 'Lancaster', text: 'Lancaster',
@ -123,7 +162,7 @@ module.exports.tests.trump = function(test, common) {
}); });
}); });
test('openstreetmap trumps whosonfirst venues', function (t) { test('openstreetmap takes priority over whosonfirst venues', function (t) {
var req = { var req = {
clean: { clean: {
text: 'Lancaster Dairy Farm', text: 'Lancaster Dairy Farm',
@ -155,7 +194,7 @@ module.exports.tests.trump = function(test, common) {
}); });
}); });
test('openaddresses trumps openstreetmap', function (t) { test('openaddresses takes priority over openstreetmap', function (t) {
var req = { var req = {
clean: { clean: {
text: '100 Main St', text: '100 Main St',
@ -187,7 +226,7 @@ module.exports.tests.trump = function(test, common) {
}); });
}); });
test('openaddresses with zip trumps openaddresses without zip', function (t) { test('openaddresses with zip takes priority over openaddresses without zip', function (t) {
var req = { var req = {
clean: { clean: {
text: '100 Main St', text: '100 Main St',
@ -223,7 +262,7 @@ module.exports.tests.trump = function(test, common) {
}); });
}); });
test('osm with zip trumps openaddresses without zip', function (t) { test('osm with zip takes priority over openaddresses without zip', function (t) {
var req = { var req = {
clean: { clean: {
text: '100 Main St', text: '100 Main St',

Loading…
Cancel
Save