mirror of https://github.com/pelias/api.git
Julian Simioni
6 years ago
committed by
GitHub
7 changed files with 307 additions and 203 deletions
@ -1,176 +1,150 @@ |
|||||||
var _ = require('lodash'); |
const _ = require('lodash'); |
||||||
var placeTypes = require('./placeTypes'); |
const placeTypes = require('./placeTypes'); |
||||||
|
const canonicalLayers = require('../helper/type_mapping').getCanonicalLayers(); |
||||||
|
const field = require('../helper/fieldValue'); |
||||||
|
|
||||||
/** |
/** |
||||||
* Compare the layer properties if they exist. |
* Compare the layer properties if they exist. |
||||||
* Returns false if the objects are the same, and throws |
* Returns false if the objects are the same, else true. |
||||||
* an exception with the message 'different' if not. |
|
||||||
* |
|
||||||
* @param {object} item1 |
|
||||||
* @param {object} item2 |
|
||||||
* @returns {boolean} |
|
||||||
* @throws {Error} |
|
||||||
*/ |
*/ |
||||||
function assertLayerMatch(item1, item2) { |
function isLayerDifferent(item1, item2){ |
||||||
if (item1.layer === item2.layer) { |
if( isPropertyDifferent(item1, item2, 'layer') ){ |
||||||
return false; |
// consider all custom layers to be analogous to a venue
|
||||||
|
if( ( item1.layer === 'venue' || !_.includes( canonicalLayers, item1.layer ) ) && |
||||||
|
( item2.layer === 'venue' || !_.includes( canonicalLayers, item2.layer ) ) ){ |
||||||
|
return false; |
||||||
|
} |
||||||
|
return true; |
||||||
} |
} |
||||||
|
return false; |
||||||
throw new Error('different'); |
|
||||||
} |
} |
||||||
|
|
||||||
/** |
/** |
||||||
* Compare the parent.*_id properties if they exist. |
* Compare the parent properties if they exist. |
||||||
* Returns false if the objects are the same, and throws |
* Returns false if the objects are the same, else true. |
||||||
* an exception with the message 'different' if not. |
|
||||||
* |
|
||||||
* @param {object} item1 |
|
||||||
* @param {object} item2 |
|
||||||
* @returns {boolean} |
|
||||||
* @throws {Error} |
|
||||||
*/ |
*/ |
||||||
function assertParentHierarchyMatch(item1, item2) { |
function isParentHierarchyDifferent(item1, item2){ |
||||||
// if neither object has parent, assume same
|
let parent1 = _.get(item1, 'parent'); |
||||||
if (!item1.hasOwnProperty('parent') && !item2.hasOwnProperty('parent')) { |
let parent2 = _.get(item2, 'parent'); |
||||||
return false; |
|
||||||
} |
|
||||||
|
|
||||||
// if both have parent, do the rest of the checking
|
// check if these are plain 'ol javascript objects
|
||||||
if (item1.hasOwnProperty('parent') && item2.hasOwnProperty('parent')) { |
let isPojo1 = _.isPlainObject(parent1); |
||||||
placeTypes.forEach(function (placeType) { |
let isPojo2 = _.isPlainObject(parent2); |
||||||
// don't consider its own id
|
|
||||||
if (placeType === item1.layer) { |
// if neither object has parent info, we consider them the same
|
||||||
return; |
if( !isPojo1 && !isPojo2 ){ return false; } |
||||||
} |
|
||||||
propMatch(item1.parent, item2.parent, placeType + '_id'); |
// if only one has parent info, we consider them the same
|
||||||
}); |
// note: this really shouldn't happen as at least on parent should exist
|
||||||
return false; |
if( !isPojo1 || !isPojo2 ){ return false; } |
||||||
} |
|
||||||
|
// else both have parent info
|
||||||
|
// iterate over all the placetypes, comparing between items
|
||||||
|
return placeTypes.some( placeType => { |
||||||
|
|
||||||
// if one has parent and the other doesn't consider different
|
// skip the parent field corresponding to the item placetype
|
||||||
throw new Error('different'); |
if( placeType === item1.layer ){ return false; } |
||||||
|
|
||||||
|
// ensure the parent ids are the same for all placetypes
|
||||||
|
return isPropertyDifferent( item1.parent, item2.parent, placeType + '_id' ); |
||||||
|
}); |
||||||
} |
} |
||||||
|
|
||||||
/** |
/** |
||||||
* Compare the name.* properties if they exist. |
* Compare the name properties if they exist. |
||||||
* Returns false if the objects are the same, and throws |
* Returns false if the objects are the same, else true. |
||||||
* an exception with the message 'different' if not. |
|
||||||
* |
|
||||||
* @param {object} item1 |
|
||||||
* @param {object} item2 |
|
||||||
* @returns {boolean} |
|
||||||
* @throws {Error} |
|
||||||
*/ |
*/ |
||||||
function assertNameMatch(item1, item2) { |
function isNameDifferent(item1, item2){ |
||||||
if (item1.hasOwnProperty('name') && item2.hasOwnProperty('name')) { |
let names1 = _.get(item1, 'name'); |
||||||
for (var lang in item1.name) { |
let names2 = _.get(item2, 'name'); |
||||||
if(item2.name.hasOwnProperty(lang) || lang === 'default') { |
|
||||||
// do not consider absence of an additional name as a difference
|
// check if these are plain 'ol javascript objects
|
||||||
propMatch(item1.name, item2.name, lang); |
let isPojo1 = _.isPlainObject(names1); |
||||||
} |
let isPojo2 = _.isPlainObject(names2); |
||||||
|
|
||||||
|
// if neither object has name info, we consider them the same
|
||||||
|
if( !isPojo1 && !isPojo2 ){ return false; } |
||||||
|
|
||||||
|
// if only one has name info, we consider them the same
|
||||||
|
// note: this really shouldn't happen as name is a mandatory field
|
||||||
|
if( !isPojo1 || !isPojo2 ){ return false; } |
||||||
|
|
||||||
|
// else both have name info
|
||||||
|
// iterate over all the languages in item1, comparing between items
|
||||||
|
return Object.keys(names1).some( lang => { |
||||||
|
|
||||||
|
// do not consider absence of an additional name as a difference
|
||||||
|
// but strictly enfore that 'default' must be present and match
|
||||||
|
if( _.has(names2, lang) || lang === 'default' ){ |
||||||
|
|
||||||
|
// do not consider absence of an additional name as a difference
|
||||||
|
return isPropertyDifferent(names1, names2, lang); |
||||||
} |
} |
||||||
} |
}); |
||||||
else { |
|
||||||
propMatch(item1, item2, 'name'); |
|
||||||
} |
|
||||||
} |
} |
||||||
|
|
||||||
/** |
/** |
||||||
* Compare the address_parts properties if they exist. |
* Compare the address_parts properties if they exist. |
||||||
* Returns false if the objects are the same, and throws |
* Returns false if the objects are the same, else true. |
||||||
* an exception with the message 'different' if not. |
|
||||||
* |
|
||||||
* @param {object} item1 |
|
||||||
* @param {object} item2 |
|
||||||
* @returns {boolean} |
|
||||||
* @throws {Error} |
|
||||||
*/ |
*/ |
||||||
function assertAddressMatch(item1, item2) { |
function isAddressDifferent(item1, item2){ |
||||||
// if neither record has address, assume same
|
let address1 = _.get(item1, 'address_parts'); |
||||||
if (!item1.hasOwnProperty('address_parts') && !item2.hasOwnProperty('address_parts')) { |
let address2 = _.get(item2, 'address_parts'); |
||||||
return false; |
|
||||||
} |
|
||||||
|
|
||||||
// if both have address, check parts
|
// check if these are plain 'ol javascript objects
|
||||||
if (item1.hasOwnProperty('address_parts') && item2.hasOwnProperty('address_parts')) { |
let isPojo1 = _.isPlainObject(address1); |
||||||
propMatch(item1.address_parts, item2.address_parts, 'number'); |
let isPojo2 = _.isPlainObject(address2); |
||||||
propMatch(item1.address_parts, item2.address_parts, 'street'); |
|
||||||
|
|
||||||
// only compare zip if both records have it, otherwise just ignore and assume it's the same
|
// if neither object has address info, we consider them the same
|
||||||
// since by this time we've already compared parent hierarchies
|
if( !isPojo1 && !isPojo2 ){ return false; } |
||||||
if (item1.address_parts.hasOwnProperty('zip') && item2.address_parts.hasOwnProperty('zip')) { |
|
||||||
propMatch(item1.address_parts, item2.address_parts, 'zip'); |
// if only one has address info, we consider them the same
|
||||||
} |
if( !isPojo1 || !isPojo2 ){ return false; } |
||||||
|
|
||||||
return false; |
// else both have address info
|
||||||
|
if( isPropertyDifferent(address1, address2, 'number') ){ return true; } |
||||||
|
if( isPropertyDifferent(address1, address2, 'street') ){ return true; } |
||||||
|
|
||||||
|
// only compare zip if both records have it, otherwise just ignore and assume it's the same
|
||||||
|
// since by this time we've already compared parent hierarchies
|
||||||
|
if( _.has(address1, 'zip') && _.has(address2, 'zip') ){ |
||||||
|
if( isPropertyDifferent(address1, address2, 'zip') ){ return true; } |
||||||
} |
} |
||||||
|
|
||||||
// one has address and the other doesn't, different!
|
return false; |
||||||
throw new Error('different'); |
|
||||||
} |
} |
||||||
|
|
||||||
/** |
/** |
||||||
* Compare the two records and return true if they differ and false if same. |
* Compare the two records and return true if they differ and false if same. |
||||||
* |
|
||||||
* @param {object} item1 |
|
||||||
* @param {object} item2 |
|
||||||
* @returns {boolean} |
|
||||||
* @throws {Error} |
|
||||||
*/ |
*/ |
||||||
function isDifferent(item1, item2) { |
function isDifferent(item1, item2){ |
||||||
try { |
if( isLayerDifferent( item1, item2 ) ){ return true; } |
||||||
assertLayerMatch(item1, item2); |
if( isParentHierarchyDifferent( item1, item2 ) ){ return true; } |
||||||
assertParentHierarchyMatch(item1, item2); |
if( isNameDifferent( item1, item2 ) ){ return true; } |
||||||
assertNameMatch(item1, item2); |
if( isAddressDifferent( item1, item2 ) ){ return true; } |
||||||
assertAddressMatch(item1, item2); |
|
||||||
} |
|
||||||
catch (err) { |
|
||||||
if (err.message === 'different') { |
|
||||||
return true; |
|
||||||
} |
|
||||||
throw err; |
|
||||||
} |
|
||||||
|
|
||||||
return false; |
return false; |
||||||
} |
} |
||||||
|
|
||||||
/** |
/** |
||||||
* Throw exception if properties are different |
* return true if properties are different |
||||||
* |
|
||||||
* @param {object} item1 |
|
||||||
* @param {object} item2 |
|
||||||
* @param {string} prop |
|
||||||
* @throws {Error} |
|
||||||
*/ |
*/ |
||||||
function propMatch(item1, item2, prop) { |
function isPropertyDifferent(item1, item2, prop ){ |
||||||
var prop1 = item1[prop]; |
|
||||||
var prop2 = item2[prop]; |
|
||||||
|
|
||||||
// in the case the property is an array (currently only in parent schema)
|
// if neither item has prop, we consider them the same
|
||||||
// simply take the 1st item. this will change in the near future to support multiple hierarchies
|
if( !_.has(item1, prop) && !_.has(item2, prop) ){ return false; } |
||||||
if (_.isArray(prop1)) { prop1 = prop1[0]; } |
|
||||||
if (_.isArray(prop2)) { prop2 = prop2[0]; } |
|
||||||
|
|
||||||
if (normalizeString(prop1) !== normalizeString(prop2)) { |
// handle arrays and other non-string values
|
||||||
throw new Error('different'); |
var prop1 = field.getStringValue( _.get( item1, prop ) ); |
||||||
} |
var prop2 = field.getStringValue( _.get( item2, prop ) ); |
||||||
|
|
||||||
|
// compare strings
|
||||||
|
return normalizeString(prop1) !== normalizeString(prop2); |
||||||
} |
} |
||||||
|
|
||||||
/** |
/** |
||||||
* Remove punctuation and lowercase |
* lowercase characters and remove some punctuation |
||||||
* |
|
||||||
* @param {string} str |
|
||||||
* @returns {string} |
|
||||||
*/ |
*/ |
||||||
function normalizeString(str) { |
function normalizeString(str){ |
||||||
if (!_.isString(str)) { |
|
||||||
return str; |
|
||||||
} |
|
||||||
|
|
||||||
if (_.isEmpty(str)) { |
|
||||||
return ''; |
|
||||||
} |
|
||||||
|
|
||||||
return str.toLowerCase().split(/[ ,-]+/).join(' '); |
return str.toLowerCase().split(/[ ,-]+/).join(' '); |
||||||
} |
} |
||||||
|
|
||||||
|
@ -1,89 +1,99 @@ |
|||||||
const logger = require('pelias-logger').get('api'); |
const logger = require('pelias-logger').get('api'); |
||||||
const _ = require('lodash'); |
const _ = require('lodash'); |
||||||
const isDifferent = require('../helper/diffPlaces').isDifferent; |
const isDifferent = require('../helper/diffPlaces').isDifferent; |
||||||
|
const canonical_sources = require('../helper/type_mapping').canonical_sources; |
||||||
const field = require('../helper/fieldValue'); |
const field = require('../helper/fieldValue'); |
||||||
|
|
||||||
function setup() { |
|
||||||
return dedupeResults; |
|
||||||
} |
|
||||||
|
|
||||||
function dedupeResults(req, res, next) { |
function dedupeResults(req, res, next) { |
||||||
// do nothing if no result data set
|
|
||||||
if (_.isUndefined(req.clean) || _.isUndefined(res) || _.isUndefined(res.data)) { |
|
||||||
return next(); |
|
||||||
} |
|
||||||
|
|
||||||
// loop through data items and only copy unique items to uniqueResults
|
// do nothing if request data is invalid
|
||||||
var uniqueResults = []; |
if( _.isUndefined(res) || !_.isPlainObject(req.clean) ){ return next(); } |
||||||
|
|
||||||
|
// do nothing if no result data is invalid
|
||||||
|
if( _.isUndefined(res) || !_.isArray(res.data) || _.isEmpty(res.data) ){ return next(); } |
||||||
|
|
||||||
|
// loop through data items and only copy unique items to unique
|
||||||
|
// note: the first results must always be unique!
|
||||||
|
let unique = [ res.data[0] ]; |
||||||
|
|
||||||
|
// convenience function to search unique array for an existing element which matches a hit
|
||||||
|
let findMatch = (hit) => unique.findIndex(elem => !isDifferent(elem, hit)); |
||||||
|
|
||||||
|
// iterate over res.data using an old-school for loop starting at index 1
|
||||||
|
// we can call break at any time to end the iterator
|
||||||
|
for( let i=1; i<res.data.length; i++){ |
||||||
|
let hit = res.data[i]; |
||||||
|
|
||||||
|
// if there are multiple items in results, loop through them to find a dupe
|
||||||
|
// save off the index of the dupe if found
|
||||||
|
let dupeIndex = findMatch(hit); |
||||||
|
|
||||||
|
// if a dupe is not found, just add to list of unique hits and continue
|
||||||
|
if( dupeIndex === -1 ){ unique.push(hit); } |
||||||
|
|
||||||
_.some(res.data, function (hit) { |
// if dupe was found, we need to check which of the records is preferred
|
||||||
|
// since the order in which Elasticsearch returns identical text matches is arbitrary
|
||||||
|
// of course, if the new one is preferred we should replace previous with new
|
||||||
|
else if( isPreferred( unique[dupeIndex], hit ) ) { |
||||||
|
|
||||||
if (_.isEmpty(uniqueResults)) { |
// replace previous dupe item with current hit
|
||||||
uniqueResults.push(hit); |
unique[dupeIndex] = hit; |
||||||
|
|
||||||
|
// logging
|
||||||
|
logger.debug('[dupe][replacing]', { |
||||||
|
query: req.clean.text, |
||||||
|
previous: unique[dupeIndex].source, |
||||||
|
hit: field.getStringValue(hit.name.default) + ' ' + hit.source + ':' + hit._id |
||||||
|
}); |
||||||
} |
} |
||||||
|
|
||||||
|
// if not preferred over existing, just log and continue
|
||||||
else { |
else { |
||||||
// if there are multiple items in results, loop through them to find a dupe
|
logger.debug('[dupe][skipping]', { |
||||||
// save off the index of the dupe if found
|
query: req.clean.text, |
||||||
var dupeIndex = uniqueResults.findIndex(function (elem, index, array) { |
previous: unique[dupeIndex].source, |
||||||
return !isDifferent(elem, hit); |
hit: field.getStringValue(hit.name.default) + ' ' + hit.source + ':' + hit._id |
||||||
}); |
}); |
||||||
|
|
||||||
// if a dupe is not found, just add to results and move on
|
|
||||||
if (dupeIndex === -1) { |
|
||||||
uniqueResults.push(hit); |
|
||||||
} |
|
||||||
// if dupe was found, we need to check which of the records is preferred
|
|
||||||
// since the order in which Elasticsearch returns identical text matches is arbitrary
|
|
||||||
// of course, if the new one is preferred we should replace previous with new
|
|
||||||
else if (isPreferred(uniqueResults[dupeIndex], hit)) { |
|
||||||
logger.debug('[dupe][replacing]', { |
|
||||||
query: req.clean.text, |
|
||||||
previous: uniqueResults[dupeIndex].source, |
|
||||||
hit: field.getStringValue(hit.name.default) + ' ' + hit.source + ':' + hit._id |
|
||||||
}); |
|
||||||
// replace previous dupe item with current hit
|
|
||||||
uniqueResults[dupeIndex] = hit; |
|
||||||
} |
|
||||||
// if not preferred over existing, just log and move on
|
|
||||||
else { |
|
||||||
logger.debug('[dupe][skipping]', { |
|
||||||
query: req.clean.text, |
|
||||||
previous: uniqueResults[dupeIndex].source, |
|
||||||
hit: field.getStringValue(hit.name.default) + ' ' + hit.source + ':' + hit._id |
|
||||||
}); |
|
||||||
} |
|
||||||
} |
} |
||||||
|
|
||||||
// stop looping when requested size has been reached in uniqueResults
|
// stop iterating when requested size has been reached in unique
|
||||||
return req.clean.size <= uniqueResults.length; |
if( unique.length >= req.clean.size ){ break; } |
||||||
}); |
} |
||||||
|
|
||||||
res.data = uniqueResults; |
// replace the original data with only the unique hits
|
||||||
|
res.data = unique; |
||||||
|
|
||||||
next(); |
next(); |
||||||
} |
} |
||||||
|
|
||||||
function isPreferred(existing, candidateReplacement) { |
// return true if the second argument represents a hit which is preferred
|
||||||
// NOTE: we are assuming here that the layer for both records is the same
|
// to the hit in the first argument
|
||||||
const hasZip = _.bind(_.has, null, _.bind.placeholder, 'address_parts.zip'); |
function isPreferred(existingHit, candidateHit) { |
||||||
|
|
||||||
|
// prefer a record with a postcode
|
||||||
// https://github.com/pelias/api/issues/872
|
// https://github.com/pelias/api/issues/872
|
||||||
const candidateHasZip = hasZip(candidateReplacement); |
if( !_.has(existingHit, 'address_parts.zip') && |
||||||
const existingHasZip = hasZip(existing); |
_.has(candidateHit, 'address_parts.zip') ){ return true; } |
||||||
if (candidateHasZip !== existingHasZip) { |
|
||||||
return candidateHasZip; |
|
||||||
} |
|
||||||
|
|
||||||
//bind the trumps function to the data items to keep the rest of the function clean
|
|
||||||
var trumpsFunc = trumps.bind(null, existing, candidateReplacement); |
|
||||||
|
|
||||||
return trumpsFunc('geonames', 'whosonfirst') || // WOF has bbox and is generally preferred
|
// prefer non-canonical sources over canonical ones
|
||||||
trumpsFunc('openstreetmap', 'openaddresses') || // addresses are better in OA
|
if( !_.includes(canonical_sources, candidateHit.source) && |
||||||
trumpsFunc('whosonfirst', 'openstreetmap'); // venues are better in OSM, at this time
|
_.includes(canonical_sources, existingHit.source) ){ return true; } |
||||||
} |
|
||||||
|
|
||||||
function trumps(existing, candidateReplacement, loserSource, winnerSource) { |
// prefer certain sources over others
|
||||||
return existing.source === loserSource && candidateReplacement.source === winnerSource; |
switch( existingHit.source ){ |
||||||
|
// sources are the same
|
||||||
|
case candidateHit.source: return false; |
||||||
|
// WOF has bbox and is generally preferred
|
||||||
|
case 'geonames': return candidateHit.source === 'whosonfirst'; |
||||||
|
// addresses are generally better in OA
|
||||||
|
case 'openstreetmap': return candidateHit.source === 'openaddresses'; |
||||||
|
// venues are better in OSM
|
||||||
|
case 'whosonfirst': return candidateHit.source === 'openstreetmap'; |
||||||
|
// no preference, keep existing hit
|
||||||
|
default: return false; |
||||||
|
} |
||||||
} |
} |
||||||
|
|
||||||
module.exports = setup; |
module.exports = function() { |
||||||
|
return dedupeResults; |
||||||
|
}; |
||||||
|
@ -0,0 +1,32 @@ |
|||||||
|
module.exports = [ |
||||||
|
{ |
||||||
|
'_id': '101914069', |
||||||
|
'layer': 'venue', |
||||||
|
'source': 'openstreetmap', |
||||||
|
'name': { |
||||||
|
'default': 'Nike World Headquarters' |
||||||
|
}, |
||||||
|
'parent': { |
||||||
|
'country_a': ['USA'], |
||||||
|
'country': ['United States'], |
||||||
|
'region': ['Oregon'], |
||||||
|
'region_id': ['85688513'] |
||||||
|
}, |
||||||
|
'confidence': 0.98 |
||||||
|
}, |
||||||
|
{ |
||||||
|
'_id': '2456::trimet::major_employer', |
||||||
|
'layer': 'major_employer', |
||||||
|
'source': 'transit', |
||||||
|
'name': { |
||||||
|
'default': 'Nike World Headquarters' |
||||||
|
}, |
||||||
|
'parent': { |
||||||
|
'country_a': ['USA'], |
||||||
|
'country': ['United States'], |
||||||
|
'region': ['Oregon'], |
||||||
|
'region_id': ['85688513'] |
||||||
|
}, |
||||||
|
'confidence': 0.50 |
||||||
|
} |
||||||
|
]; |
@ -0,0 +1,27 @@ |
|||||||
|
module.exports = [ |
||||||
|
{ |
||||||
|
'_id': '101914069', |
||||||
|
'layer': 'venue', |
||||||
|
'source': 'openstreetmap', |
||||||
|
'name': { |
||||||
|
'default': 'A place' |
||||||
|
}, |
||||||
|
'parent': { |
||||||
|
'country_a': ['USA'] |
||||||
|
} |
||||||
|
}, |
||||||
|
{ |
||||||
|
'_id': '323', |
||||||
|
'layer': 'venue', |
||||||
|
'source': 'openstreetmap', |
||||||
|
'name': { |
||||||
|
'default': 'A place' |
||||||
|
}, |
||||||
|
'address_parts': { |
||||||
|
'zip': '97005' |
||||||
|
}, |
||||||
|
'parent': { |
||||||
|
'country_a': ['USA'] |
||||||
|
} |
||||||
|
} |
||||||
|
]; |
Loading…
Reference in new issue